known issues updated

master
Gutenberg back end service account 2020-08-27 16:33:51 -04:00
parent f95b2c292f
commit 8e941036c1
3 changed files with 278 additions and 0 deletions

120
robot/harvest.php Normal file
View File

@ -0,0 +1,120 @@
<?php
# harvest.php: emits one HTML page of download links for robots, plus a
# "Next Page" link that continues after the last primary key shown.
#
# Request parameters (read through the getint/getarray helpers, which are
# defined outside this file; the rest of the script reads $offset, $filetypes
# and $langs, so they presumably populate those globals - confirm in pgcat.phh):
#   offset       only files with pk greater than this are listed (default 0)
#   filetypes[]  optional filter on files.fk_filetypes
#   langs[]      optional filter on fk_langs
#
# gbn: Our high-speed mirrors. Sailor is gone. 20170417
# $mirror = 'http://www.gutenberg.lib.md.us/';
$mirror = 'http://aleph.gutenberg.org/';
# $epub_mirror = 'http://gutenberg.pglaf.org/';
$epub_mirror = 'http://aleph.gutenberg.org/';

include_once ("pgcat.phh");

// Maximum number of rows listed per page.
$limit = 100;

getint ("offset", 0);
getarray ("filetypes");
getarray ("langs");

// sanitize array
// These values are interpolated into the SQL statement and into the
// "Next Page" URL below, so only a strict whitelist is accepted. The D
// modifier (PCRE_DOLLAR_ENDONLY) makes "$" match only at the very end of the
// subject; without it a value ending in a newline would pass validation.
foreach ($filetypes as $ft) {
    if (!preg_match ("/^[.a-z0-9]+$/iD", $ft)) {
        header("HTTP/1.0 404 Not Found");
        p ("Malformed filetype.");
        exit ();
    }
}

foreach ($langs as $l) {
    if (!preg_match ("/^[a-z0-9]+$/iD", $l)) {
        header("HTTP/1.0 404 Not Found");
        p ("Malformed language code.");
        exit ();
    }
}

$db = $config->db ();

// Human-readable summary of the active filters for the title and heading.
$fts = join (" ", $filetypes);
if (strlen ($fts)) {
    $fts = ", filetypes: $fts";
}

$ls = join (" ", $langs);
if (strlen ($ls)) {
    $ls = ", languages: $ls";
}

$caption = "All Files (offset: $offset$fts$ls)";

echo <<< EOT
<!DOCTYPE HTML PUBLIC "$config->dtd_public" "$config->dtd_system">
<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>$caption - Project Gutenberg</title>
</head>
<body>
<h1>$caption</h1>
EOT;

// Optional SQL filters. The values were whitelisted above, so they cannot
// contain quotes; an empty list contributes no clause at all.
$where_filetypes = join ("','", $filetypes);
if (strlen ($where_filetypes)) {
    $where_filetypes = " and fk_filetypes in ('$where_filetypes') ";
}

$where_languages = join ("','", $langs);
if (strlen ($where_languages)) {
    $where_languages = " and fk_langs in ('$where_languages') ";
}

// NOTE(review): $offset is interpolated directly; this relies on getint()
// having coerced it to an integer - confirm in pgcat.phh.
$db->exec ($sql = "select pk, files.fk_books, fk_langs, filename
from files left join mn_books_langs
on mn_books_langs.fk_books = files.fk_books
where (fk_compressions = 'zip' or fk_filetypes in
('epub.images', 'epub.noimages', 'kindle.images', 'kindle.noimages', 'mp3'))
$where_filetypes
$where_languages
and diskstatus != 5 and obsoleted = 0
and pk > $offset order by pk limit $limit;");

// p ($sql);

if ($db->FirstRow ()) {
    do {
        $filename = $db->get ("filename", SQLCHAR);
        // Remember the last pk seen; it becomes the offset of the next page.
        $offset = $db->get ("pk", SQLINT);
        // Only referenced by the disabled symlink logic below.
        $fk_books = $db->get ("fk_books", SQLINT);

        /*
        $dir = etext2dir ($fk_books);
        if (preg_match ("!^$dir!", $filename)) {
        $symlink = preg_replace ("!^$dir!", "$config->files/$fk_books/", $filename);
        } elseif (strncmp ($filename, "cache/", 6) == 0) {
        $symlink = "/$filename";
        } else {
        $symlink = "$config->downloadbase/$filename";
        } */

        if (strpos ($filename, '/epub/') !== false) {
            // gbn: For change to aleph.gutenberg.org:
            // $filename = str_replace ('/epub/', '/generated/', $filename);
            $symlink = $epub_mirror . $filename;
        } else {
            $symlink = $mirror . $filename;
        }

        // The filename comes from the database and is not guaranteed to be
        // HTML-safe; escape it for both the attribute and the link text.
        $link = htmlspecialchars ($symlink, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        p ("<a href=\"$link\">$link</a>");
    } while ($db->NextRow ());

    // Carry the active filters over to the next page. The values passed the
    // whitelist above, so they need no further escaping here.
    $url = "harvest?offset=$offset";
    foreach ($filetypes as $filetype) {
        $url .= "&amp;filetypes[]=$filetype";
    }
    foreach ($langs as $l) {
        $url .= "&amp;langs[]=$l";
    }
    p ("<a href=\"$url\">Next Page</a>");
} else {
    p ("No more files.");
}

echo (" </body>\n</html>\n");
?>

View File

@ -0,0 +1,78 @@
---
layout: default
title: Volunteers' FAQ | Project Gutenberg
permalink: /help/volunteers_faq.html
---
Volunteers' FAQ
===============
Project Gutenberg welcomes contributions of eBooks from people with
the interest, time, and skillset needed to meet our submission
standards. Details of the process and the standards are at our
copyright clearance site [copy.pglaf.org](https://copy.pglaf.org) and upload site
[upload.pglaf.org](https://upload.pglaf.org).
Join Distributed Proofreaders, Instead
--------------------------------------
For most people interested in producing eBooks, we recommend starting
with Distributed Proofreaders (https://www.pgdp.net). With
Distributed Proofreaders, you can get involved with different portions
of the production pipeline described below. This is a much easier way
to get started, and results in very high quality eBooks.
If you simply want to suggest a book for digitization, DP has online
forums for this, or you can simply send an email (contact information
is on the site).
Distributed Proofreaders maintains canonical guidance on production.
See especially:
* [The Post-Processing FAQ](https://www.pgdp.net/wiki/DP_Official_Documentation:PP_and_PPV/Post-Processing_FAQ)
* [Easy Epub](https://www.pgdp.net/wiki/DP_Official_Documentation:PP_and_PPV/Easy_Epub) (It's a guide to how best to handle the HTML that goes through epubmaker to lead to passable epubs/mobis)
* [HTML Best Practices](https://www.pgdp.org/~jana/best-practices/) (this was written a while back but DP tries to keep it up-to-date)
Being a Solo Producer
---------------------
If you might be interested in producing an eBook yourself, without involving
Distributed Proofreaders, here is some guidance. But start with what's above,
including the DP links.
In a nutshell, the production process typically involves the following:
- Identify a candidate printed book. Confirm it is not already in the
collection, or in process by other volunteers. Use the [Collection
Development Policy](/policy/collection_development.html) to guide
you on eligibility.
- Obtain a copyright clearance for the printed book. Usually this is
based on scanned title page and verso page demonstrating the printed
book was published more than 95 years ago. See the [Copyright
How-To](/help/copyright.html).
- Obtain scans of the book. This may be done using your own scanner,
or there might be online scans available for reuse. Scans
must come from the exact same print edition as your copyright
clearance.
- Perform optical character recognition (OCR) on the scans, to make an
approximate representation of the book in plain text.
- Proofread, proofread, proofread: "Fix" the OCR output by carefully
correcting any errors it made. Remove page headers and
footers. De-hyphenate. Add back italics or other formatting.
- Format: Generate valid and well-formed HTML source. Different tools
are available for this, and usually involve editing the HTML source
code directly. Note that many tools produce convoluted, non-standard,
or non-valid HTML, which can be very difficult to clean up for Project
Gutenberg: poor HTML is not accepted, even if it is valid.
- Check, and recheck. The upload site has various tools, including tools to
test proper conversion to derived formats.
- Upload your work, using the copyright clearance key generated
earlier.
- Coordinate with the Project Gutenberg production volunteers (known
as "whitewashers," after the Mark Twain book) on final formatting and
presentation.
- Once the eBook is added to the Project Gutenberg collection, confirm
it is appearing correctly, and all metadata are correct.
- If possible, stay in touch into the future. If we receive errata
reports that require access to source material, or are stylistic or
subjective in nature, we might get in touch to discuss potential
changes.

View File

@ -0,0 +1,80 @@
---
layout: default
title: Volunteers' FAQ | Project Gutenberg
permalink: /help/volunteers_faq.html
---
Volunteers' FAQ
===============
Project Gutenberg welcomes contributions of eBooks from people with
the interest, time, and skillset needed to meet our submission
standards. Details of the process and the standards are at our
copyright clearance site (https://copy.pglaf.org) and upload site
(https://upload.pglaf.org).
Join Distributed Proofreaders, Instead
--------------------------------------
For most people interested in producing eBooks, we recommend starting
with Distributed Proofreaders (https://www.pgdp.net). With
Distributed Proofreaders, you can get involved with different portions
of the production pipeline described below. This is a much easier way
to get started, and results in very high quality eBooks.
If you simply want to suggest a book for digitization, DP has online
forums for this, or you can simply send an email (contact information
is on the site).
Distributed Proofreaders maintains canonical guidance on production.
See especially:
* The Post-Processing FAQ --
https://www.pgdp.net/wiki/DP_Official_Documentation:PP_and_PPV/Post-Processing_FAQ
* Easy Epub -- https://www.pgdp.net/wiki/DP_Official_Documentation:PP_and_PPV/Easy_Epub (It's a guide to how best to handle the HTML that goes through epubmaker to lead to passable epubs/mobis)
* HTML Best Practices -- https://www.pgdp.org/~jana/best-practices/ (this was written a while back but DP tries to keep it up-to-date)
Being a Solo Producer
---------------------
If you might be interested in producing an eBook yourself, without involving
Distributed Proofreaders, here is some guidance. But start with what's above,
including the DP links.
In a nutshell, the production process typically involves the following:
- Identify a candidate printed book. Confirm it is not already in the
collection, or in process by other volunteers. Use the [Collection
Development Policy](/policy/collection_development.html) to guide
you on eligibility.
- Obtain a copyright clearance for the printed book. Usually this is
based on scanned title page and verso page demonstrating the printed
book was published more than 95 years ago. See the [Copyright
How-To](/help/copyright.html).
- Obtain scans of the book. This may be done using your own scanner,
or there might be online scans available for reuse. Scans
must come from the exact same print edition as your copyright
clearance.
- Perform optical character recognition (OCR) on the scans, to make an
approximate representation of the book in plain text.
- Proofread, proofread, proofread: "Fix" the OCR output by carefully
correcting any errors it made. Remove page headers and
footers. De-hyphenate. Add back italics or other formatting.
- Format: Generate valid and well-formed HTML source. Different tools
are available for this, and usually involve editing the HTML source
code directly. Note that many tools produce convoluted, non-standard,
or non-valid HTML, which can be very difficult to clean up for Project
Gutenberg: poor HTML is not accepted, even if it is valid.
- Check, and recheck. The upload site has various tools, including tools to
test proper conversion to derived formats.
- Upload your work, using the copyright clearance key generated
earlier.
- Coordinate with the Project Gutenberg production volunteers (known
as "whitewashers," after the Mark Twain book) on final formatting and
presentation.
- Once the eBook is added to the Project Gutenberg collection, confirm
it is appearing correctly, and all metadata are correct.
- If possible, stay in touch into the future. If we receive errata
reports that require access to source material, or are stylistic or
subjective in nature, we might get in touch to discuss potential
changes.