From 8e941036c1d370bda96b0dcd545910e4fc2a21e7 Mon Sep 17 00:00:00 2001 From: Gutenberg back end service account Date: Thu, 27 Aug 2020 16:33:51 -0400 Subject: [PATCH] known issues updated --- robot/harvest.php | 120 +++++++++++++++++++++++++++++++++++ site/help/volunteers_faq.md | 78 +++++++++++++++++++++++ site/help/volunteers_faq.md~ | 80 +++++++++++++++++++++++ 3 files changed, 278 insertions(+) create mode 100644 robot/harvest.php create mode 100644 site/help/volunteers_faq.md create mode 100644 site/help/volunteers_faq.md~ diff --git a/robot/harvest.php b/robot/harvest.php new file mode 100644 index 0000000..55f1c47 --- /dev/null +++ b/robot/harvest.php @@ -0,0 +1,120 @@ +db (); + +$fts = join (" ", $filetypes); +if (strlen ($fts)) { + $fts = ", filetypes: $fts"; +} +$ls = join (" ", $langs); +if (strlen ($ls)) { + $ls = ", languages: $ls"; +} +$caption = "All Files (offset: $offset$fts$ls)"; + +echo <<< EOT +dtd_public" "$config->dtd_system"> + + + + + $caption - Project Gutenberg + + +

$caption

+EOT; + +$where_filetypes = join ("','", $filetypes); +if (strlen ($where_filetypes)) { + $where_filetypes = " and fk_filetypes in ('$where_filetypes') "; +} +$where_languages = join ("','", $langs); +if (strlen ($where_languages)) { + $where_languages = " and fk_langs in ('$where_languages') "; +} + +$db->exec ($sql = "select pk, files.fk_books, fk_langs, filename +from files left join mn_books_langs +on mn_books_langs.fk_books = files.fk_books +where (fk_compressions = 'zip' or fk_filetypes in + ('epub.images', 'epub.noimages', 'kindle.images', 'kindle.noimages', 'mp3')) +$where_filetypes +$where_languages +and diskstatus != 5 and obsoleted = 0 +and pk > $offset order by pk limit $limit;"); + +// p ($sql); + +if ($db->FirstRow ()) { + do { + $filename = $db->get ("filename", SQLCHAR); + $offset = $db->get ("pk", SQLINT); + $fk_books = $db->get ("fk_books", SQLINT); + + /* + $dir = etext2dir ($fk_books); + if (preg_match ("!^$dir!", $filename)) { + $symlink = preg_replace ("!^$dir!", "$config->files/$fk_books/", $filename); + } elseif (strncmp ($filename, "cache/", 6) == 0) { + $symlink = "/$filename"; + } else { + $symlink = "$config->downloadbase/$filename"; + } */ + + if (strpos ($filename, '/epub/') !== false) { + // gbn: For change to aleph.gutenberg.org: + // $filename = str_replace ('/epub/', '/generated/', $filename); + $symlink = $epub_mirror . $filename; + } else { + $symlink = $mirror . 
$filename; + } + + p ("$symlink"); + } while ($db->NextRow ()); + + $url = "harvest?offset=$offset"; + foreach ($filetypes as $filetype) { + $url .= "&filetypes[]=$filetype"; + } + foreach ($langs as $l) { + $url .= "&langs[]=$l"; + } + + p ("Next Page"); +} else { + p ("No more files."); +} + +echo (" \n\n"); + +?> diff --git a/site/help/volunteers_faq.md b/site/help/volunteers_faq.md new file mode 100644 index 0000000..7e6c7d3 --- /dev/null +++ b/site/help/volunteers_faq.md @@ -0,0 +1,78 @@ +--- +layout: default +title: Volunteers' FAQ | Project Gutenberg +permalink: /help/volunteers_faq.html +--- + +Volunteers' FAQ +=============== + +Project Gutenberg welcomes contributions of eBooks from people with +the interest, time, and skillset needed to meet our submission +standards. Details of the process and the standards are at our +copyright clearance site [copy.pglaf.org](https://copy.pglaf.org) and upload site +[upload.pglaf.org](https://upload.pglaf.org). + +Join Distributed Proofreaders, Instead +-------------------------------------- + +For most people interested in producing eBooks, we recommend starting +with Distributed Proofreaders (https://www.pgdp.net). With +Distributed Proofreaders, you can get involved with different portions +of the production pipeline described below. This is a much easier way +to get started, and results in very high quality eBooks. + +If you simply want to suggest a book for digitization, DP has online +forums for this, or you can simply send an email (contact information +is on the site). + +Distributed Proofreaders maintains canonical guidance on production. 
+See especially: + +* [The Post-Processing FAQ](https://www.pgdp.net/wiki/DP_Official_Documentation:PP_and_PPV/Post-Processing_FAQ) +* [Easy Epub](https://www.pgdp.net/wiki/DP_Official_Documentation:PP_and_PPV/Easy_Epub) (It's a guide to how best to handle the HTML that goes through epubmaker to lead to passable epubs/mobis) +* [HTML Best Practices](https://www.pgdp.org/~jana/best-practices/) (this was written a while back but DP tries to keep it up-to-date) + +Being a Solo Producer +--------------------- + +If you might be interested in producing an eBook yourself, without involving +Distributed Proofreaders, here is some guidance. But start with what's above, +including the DP links. + +In a nutshell, the production process typically involves the following: +- Identify a candidate printed book. Confirm it is not already in the +collection, or in process by other volunteers. Use the [Collection +Development Policy](/policy/collection_development.html) to guide +you on eligibility. +- Obtain a copyright clearance for the printed book. Usually this is +based on scanned title page and verso page demonstrating the printed +book was published more than 95 years ago. See the [Copyright +How-To](/help/copyright.html). +- Obtain scans of the book. This may be done using your own scanner, +or there might be online scans available for reuse. Scans +must come from the exact same print edition as your copyright +clearance. +- Perform optical character recognition (OCR) on the scans, to make an +approximate representation of the book in plain text. +- Proofread, proofread, proofread: "Fix" the OCR output by carefully +fixing any errors it made. Remove page headers & +footers. De-hyphenate. Add back italics or other formatting. +- Format: Generate valid and well-formed HTML source. Different tools +are available for this, and usually involve editing the HTML source +code directly. 
Note that many tools produce convoluted, non-standard, +or non-valid HTML, which can be very difficult to clean up for Project +Gutenberg: poor HTML is not accepted, even if it is valid. +- Check, and recheck. The upload site has various tools, including to +test proper conversion to derived formats. +- Upload your work, using the copyright clearance key generated +earlier. +- Coordinate with the Project Gutenberg production volunteers (known +as "whitewashers," after the Mark Twain book) on final formatting and +presentation. +- Once the eBook is added to the Project Gutenberg collection, confirm +it is appearing correctly, and all metadata are correct. +- If possible, stay in touch into the future. If we receive errata +reports that require access to source material, or are stylistic or +subjective in nature, we might get in touch to discuss potential +changes. diff --git a/site/help/volunteers_faq.md~ b/site/help/volunteers_faq.md~ new file mode 100644 index 0000000..18007bf --- /dev/null +++ b/site/help/volunteers_faq.md~ @@ -0,0 +1,80 @@ +--- +layout: default +title: Volunteers' FAQ | Project Gutenberg +permalink: /help/volunteers_faq.html +--- + +Volunteers' FAQ +=============== + +Project Gutenberg welcomes contributions of eBooks from people with +the interest, time, and skillset needed to meet our submission +standards. Details of the process and the standards are at our +copyright clearance site (https://copy.pglaf.org) and upload site +(https://upload.pglaf.org). + +Join Distributed Proofreaders, Instead +-------------------------------------- + +For most people interested in producing eBooks, we recommend starting +with Distributed Proofreaders (https://www.pgdp.net). With +Distributed Proofreaders, you can get involved with different portions +of the production pipeline described below. This is a much easier way +to get started, and results in very high quality eBooks. 
+ +If you simply want to suggest a book for digitization, DP has online +forums for this, or you can simply send an email (contact information +is on the site). + +Distributed Proofreaders maintains canonical guidance on production. +See especially: + +* The Post-Processing FAQ -- +https://www.pgdp.net/wiki/DP_Official_Documentation:PP_and_PPV/Post-Processing_FAQ +* Easy Epub -- https://www.pgdp.net/wiki/DP_Official_Documentation:PP_and_PPV/Easy_Epub (It's a guide to how best to handle the HTML that goes through epubmaker to lead to passable epubs/mobis) +* HTML Best Practices -- https://www.pgdp.org/~jana/best-practices/ (this was written a while back but DP tries to keep it up-to-date) + + +Being a Solo Producer +--------------------- + +If you might be interested in producing an eBook yourself, without involving +Distributed Proofreaders, here is some guidance. But start with what's above, +including the DP links. + +In a nutshell, the production process typically involves the following: +- Identify a candidate printed book. Confirm it is not already in the +collection, or in process by other volunteers. Use the [Collection +Development Policy](/policy/collection_development.html) to guide +you on eligibility. +- Obtain a copyright clearance for the printed book. Usually this is +based on scanned title page and verso page demonstrating the printed +book was published more than 95 years ago. See the [Copyright +How-To](/help/copyright.html). +- Obtain scans of the book. This may be done using your own scanner, +or there might be online scans available for reuse. Scans +must come from the exact same print edition as your copyright +clearance. +- Perform optical character recognition (OCR) on the scans, to make an +approximate representation of the book in plain text. +- Proofread, proofread, proofread: "Fix" the OCR output by carefully +fixing any errors it made. Remove page headers & +footers. De-hyphenate. Add back italics or other formatting. 
+- Format: Generate valid and well-formed HTML source. Different tools +are available for this, and usually involve editing the HTML source +code directly. Note that many tools produce convoluted, non-standard, +or non-valid HTML, which can be very difficult to clean up for Project +Gutenberg: poor HTML is not accepted, even if it is valid. +- Check, and recheck. The upload site has various tools, including to +test proper conversion to derived formats. +- Upload your work, using the copyright clearance key generated +earlier. +- Coordinate with the Project Gutenberg production volunteers (known +as "whitewashers," after the Mark Twain book) on final formatting and +presentation. +- Once the eBook is added to the Project Gutenberg collection, confirm +it is appearing correctly, and all metadata are correct. +- If possible, stay in touch into the future. If we receive errata +reports that require access to source material, or are stylistic or +subjective in nature, we might get in touch to discuss potential +changes.