From 265e9231903c118245870a54172b93beefa61d77 Mon Sep 17 00:00:00 2001
From: Thom
Date: Fri, 25 Apr 2014 19:14:19 -1000
Subject: [PATCH] tesseract: updated language data

Closes Homebrew/homebrew#28731.

Signed-off-by: Adam Vandenberg
---
 Formula/tesseract.rb | 132 +++++++++++++++++++++++--------------------
 1 file changed, 70 insertions(+), 62 deletions(-)

diff --git a/Formula/tesseract.rb b/Formula/tesseract.rb
index ee6cf929b06..958cf19a842 100644
--- a/Formula/tesseract.rb
+++ b/Formula/tesseract.rb
@@ -24,65 +24,83 @@ class Tesseract < Formula
     cause "Executable 'tesseract' segfaults on 10.6 when compiled with llvm-gcc"
   end
 
+  # Version 3.02 language packages. Alphabetized by language code.
   LANGS = {
+    'afr' => 'fe8fa7f87d207024938abf6e143f6612cd6bbbc7',
+    'ara' => 'e15cf6b7a027454db56ecedab0038c7739ab29cc',
+    'aze' => '8e26797471e9cd943ac76cd6803e596ed3fb0625',
+    'bel' => '9fc3a081e077b23d8da253f227e8a337acd9a6c8',
+    'ben' => 'd1b2dec8ee537598594554768de899a9e8e361f6',
+    'bul' => '34ef79f0f0f3f2b7715c343ea5357791e52336d8',
+    'cat' => '1d806a2346100d93bcfc32ebf4b4c94ab006ef93',
+    'ces' => '82f1797de7c06bfa9a20dbf2e4e1db03a0e94be0',
+    'chi_sim' => 'edcfd4cea1b5e52a37eed605c6a2d7d5e15ef03f',
+    'chi_tra' => 'f2c8bca5ffeb62e0a10d685d4f66a029472ffe52',
+    'chr' => 'b9ead6114a41b395a82f1d33a276d524af81f993',
+    # this Danish package also contains Danish Fraktur language data
+    'dan' => 'b2c4c09cde1ac5229d22eff72accb60e66b09c2b',
+    # this German package also contains German Fraktur language data
+    'deu' => '6d21596225f9a4c36fa81b19518f2aead8c8ac79',
+    'ell' => '5a5746c7a1e473c89de71a4dd46a8a888ba4ce76',
     'eng' => '989ed4c3a5b246d7353893e466c353099d8b73a1',
+    'enm' => '8d83830859654ffb7e1228f6fbd2604aafea987f',
+    'epo' => '84ff2e071864e2766dcde842cdbe74d6bdd59549',
+    'epo_alt' => 'e481f3e41c37dcf28786a90b9384321f8ca15eda',
+    'equ' => '24b46c2bfe4a652b6ac7bdee9afb68f44ffc333e',
+    'est' => 'd37d2c295472f6e6ded26c9d799380d205a8cec4',
+    'eus' => '072b7a4fc36c8b28903e9ef651693be702f21afd',
+    'fin' => 'c30b504178e5fabd8f084f59e72aa9579b4ee436',
+    # this French package also contains fraktur for Danish, German, Slovakian
+    'fra' => '627893d878b33138608df372d191bd799b0edd4f',
+    'frk' => '81b897434eee2762757519097913bd02ba527dd1',
+    'frm' => 'e52f58b8244f67f07b3bc0f849247b79ab6d1bcb',
+    'glg' => 'dee1605b15a57b321e7ad747df6c12a9491cfb08',
+    'grc' => '2502de7df524264959c73f7253c580f491ee9a2d',
     'heb' => '67e10e616caf62545eacd436e85f89436687e22b',
     'hin' => '4ceef97ffb8b4ab5ac79ee4bad5b5be0885f228f',
-    'ara' => 'e15cf6b7a027454db56ecedab0038c7739ab29cc',
-    'tha' => '04a35c04585a887662dc668e54f5368dabf31f50'
+    'hrv' => 'c281dbaca6c1fa39ab1e7a82b43542fa46a9020b',
+    'hun' => '4d7b4534d03c1a6a3861dac0f66da13fd9601e62',
+    'ind' => '364fe00da0be5bd47661a68acf10ac0bc928cdaf',
+    'isl' => '7036d949d1860f192d4e038597c435d47fe8ca9a',
+    # this Italian package also contains Old Italian language data
+    'ita' => 'b4d75234cab050db69874190d1bab5b2b1c59961',
+    'jpn' => '7212a708ef688687538abe9a40aab99aa06017a2',
+    'kan' => 'd3a71631227b04dbcd370b1fdee6d423655663b7',
+    'kor' => 'd39e165fb73e339c21fbdf995b9d0fffda9fd7db',
+    'lav' => '98fccc5c8b1613364d5386b29c9dfa82d178d772',
+    'lit' => '1ef0fb76f9b463aa8a6e41056c87842d43102698',
+    'mal' => 'aadbded7e50f7dd503d62f99060836c9f4d65732',
+    'mkd' => 'f6a5235ad131dde816f3f151df4f999f4ea214c6',
+    'mlt' => '72d4ee59e6fd79f618a7ef09bdedb5e887d20db4',
+    'msa' => '6344682dc9fe541499f455ae88c07f5941d5e646',
+    'nld' => 'e68e491d68fa367e2d173b12f19330e3ed72e750',
+    'nor' => 'cc0f69a4df82adb0f8591f1dfbe3920ced46181b',
+    'pol' => 'b6c1092ad1bdba5dc019d8f84d3e6c3478e4ffb0',
+    'por' => '919e6503bd7fc00ffeb27b0823ddad3d2aaf7ba9',
+    'ron' => '159afd5da08a58ef2cc49be0166068d70c297384',
+    'rus' => '5caa3f1c5d46642e7c0c17ecd7cc4fe2a4aa2b0b',
+    # this Slovakian package also contains Slovakian Fraktur language data
+    'slk' => '946d6033b276f7e3fcfceca35978d3380d10710f',
+    'slv' => 'cbd9506944069954e3742cbcac2f15ca7f7c90cc',
+    # this Spanish package also contains Old Spanish language data
+    'spa' => '51e1289f0320bd750be4b04065dcae6862562b01',
+    'sqi' => '4d865da27ec2b0ce5f5dbc67a7a1781af1eca05c',
+    'srp' => 'ca496054260efb361f6873ba7212dd7438fd496c',
+    'swa' => '453ca6cd1b0b154006eef2046fb008796f59ce8a',
+    'swe' => '8ecabd4d010ff89d6d3975fb09fc622bf1c157bb',
+    'tam' => 'b1067548993f168b06019616fc1a1d515169ee84',
+    'tel' => '726a411daa5ba3c5a6e3e04e870591ba65bf3ee8',
+    'tgl' => '5437a1a2ffc3dceea2e241b41bd414e64148274d',
+    'tha' => '04a35c04585a887662dc668e54f5368dabf31f50',
+    'tur' => '017c11a1630ad96f600a863803a892aeadaa34fb',
+    'ukr' => '729a2bd1a2cf81c314b8b0f2057260019f056149',
+    'vie' => '9ab987ff577f1e1c536000d97cf6247f44ef8fdd'
   }
 
   LANGS.each do |name, sha|
     resource name do
       url "https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.#{name}.tar.gz"
-      sha1 sha
-    end
-  end
-
-  LANGS_OLD = {
-    'chr' => 'e49b17bb73911926050d45832171a54ab1d1f34c',
-    'deu-frak' => '5651562e0d944b5b89cc5977d71482089f12669f',
-    'swe-frak' => '22220ad4303ebe290e4e71170e96b488e81a7f1a',
-    'chi_tra' => 'a9798de7e068d85613602aa33a153da721aadc82',
-    'chi_sim' => '35f0254f159edeed509ec1e0779073bf998b6cdb',
-    'ind' => 'f4214ce40c5f6ef92085a8a45e9ff03f7cf7afca',
-    'swe' => '55291e8ea664155ad51db867284c11ad1a1c5d00',
-    'ron' => 'c20c73a2e17f5fe692de0fe9ac681da3984229ae',
-    'slv' => 'fbe464cd49d6a7495e6d95600d421aa2dd0b9d77',
-    'srp' => '47afc601b62998e4cc3f7403d846ba861f30b416',
-    'tgl' => '153ba1d0ddd209e1581d81d42fe5346f748e2f27',
-    'tur' => 'a01da62f3830833b258e2d46ce0f2852571470e6',
-    'hun' => '32ecad03877a841fbc0cb31c269214640008d604',
-    'fin' => '004d74d13f7b53cbefb86e2ba12bc67dce81d936',
-    'ita' => 'c166ba79256f6e7c1b993b2db7403d794131fe05',
-    'nld' => 'f7e3d46b1747a19158ac0797e859b65c56b5045f',
-    'nor' => 'fb65dede5fbe120823ecdcb0c6cbd1222ae7e245',
-    'jpn' => '6d605eee29e76fb841924916bd34095bbbbc45c0',
-    'vie' => '9158748a63afe87e4e25b5f32c222555f2ad8417',
-    'spa' => '7b30950749e84891fdef5f89409c3cf1b6418cd3',
-    'ukr' => '06ceebfd91fa473d6d91f8a2856c66733bea0131',
-    'fra' => '8d698bb3b659e862b3274970a57b3214de76f1ff',
-    'slk' => '16207e26d53504f98a7b1fadcb873dc4611149ec',
-    'kor' => '37bcd8110a426714f54d99f58b30104b3014ce5a',
-    'ell' => 'b7a449fc320cc579a729c0352e5cc642f565e64e',
-    'rus' => '2740accefc45e4ae004269ccb195948b8037a583',
-    'por' => '883e5e1fa1d991ef6d202951ee9d26a71db181dc',
-    'bul' => 'a9efae5e347a36ea90bd2ad357e732ad4da47fd3',
-    'lav' => 'b4efd308e725d743884f2984f804c82dd5382f63',
-    'lit' => '7adbe396a281c0f87c0b95da7e84b5b6029e3dbd',
-    'pol' => 'a303fc31b4b60532b01b4ccdc838f02ff0113f27',
-    'dan-frak' => 'c0eba6d3ca688a04fd8e3ce45fdbbf20e8e67d45',
-    'deu' => 'c4b3ecde18ce9f114faba88cdfd0308f90801266',
-    'dan' => 'bfac9c00d28fc4b19034c2098d41087a173084ae',
-    'ces' => 'dbec19aa23f42a08e6b195a96e64b443f7519620',
-    'cat' => '0301a9c81c1d646bd1b135ca89476fb63bd634f8',
-  }
-
-  # pre-3.01 language data uses a different URL format and installs differently
-  LANGS_OLD.each do |lang, sha|
-    resource lang do
-      url "https://tesseract-ocr.googlecode.com/files/#{lang}.traineddata.gz"
-      version "3.00"
+      version "3.02" # otherwise "ocr" incorrectly detected as the version
       sha1 sha
     end
   end
@@ -96,19 +114,9 @@ class Tesseract < Formula
     system "./configure", "--disable-dependency-tracking", "--prefix=#{prefix}"
     system "make install"
     if build.include? "all-languages"
-      install_language_data
+      resources.each { |r| r.stage { mv Dir["tessdata/*"], share/"tessdata" } }
     else
-      resource('eng').stage { mv Dir['tessdata/*'], "#{share}/tessdata/" }
-    end
-  end
-
-  def install_language_data
-    LANGS.each_key do |lang|
-      resource(lang).stage { mv Dir["tessdata/*"], "#{share}/tessdata/" }
-    end
-
-    LANGS_OLD.each_key do |lang|
-      resource(lang).stage { mv Dir["*"], "#{share}/tessdata/" }
+      resource("eng").stage { mv Dir["tessdata/*"], share/"tessdata" }
     end
   end
 end