From cdf46591462b67fbae6f3057ab485f039ad966b8 Mon Sep 17 00:00:00 2001 From: j-sofia Date: Wed, 16 Nov 2022 17:22:34 -0500 Subject: [PATCH] OAP-37: Read stopwords from txt (#23) * read stopwords from txt read stopwords from txt and README change * leftover code removed * formatting * formatting again * formatting last try --- oapen-engine/README.md | 6 + oapen-engine/src/model/ngrams.py | 17 +- oapen-engine/src/model/stopwords.py | 435 ------------------ oapen-engine/src/model/stopwords_broken.txt | 159 +++++++ oapen-engine/src/model/stopwords_dutch.txt | 428 +++++++++++++++++ oapen-engine/src/model/stopwords_filter.txt | 19 + .../src/model/stopwords_publisher.txt | 244 ++++++++++ 7 files changed, 869 insertions(+), 439 deletions(-) delete mode 100644 oapen-engine/src/model/stopwords.py create mode 100644 oapen-engine/src/model/stopwords_broken.txt create mode 100644 oapen-engine/src/model/stopwords_dutch.txt create mode 100644 oapen-engine/src/model/stopwords_filter.txt create mode 100644 oapen-engine/src/model/stopwords_publisher.txt diff --git a/oapen-engine/README.md b/oapen-engine/README.md index cf2e92e..55ef612 100644 --- a/oapen-engine/README.md +++ b/oapen-engine/README.md @@ -28,3 +28,9 @@ While the virtual environment is running, type: ``` deactivate ``` +## How to remove/filter out bad ngrams +Members of EbookFoundation can create a pull request to edit the stopwords used to filter out bad trigrams: +``` +oapen-engine/src/model/stopwords_*.txt +``` +This also can be done to remove a malformed trigram already in the database (during the next run) \ No newline at end of file diff --git a/oapen-engine/src/model/ngrams.py b/oapen-engine/src/model/ngrams.py index 458535d..df3b438 100644 --- a/oapen-engine/src/model/ngrams.py +++ b/oapen-engine/src/model/ngrams.py @@ -1,8 +1,8 @@ +import os import string from typing import List import data.oapen_db as OapenDB -import model.stopwords as oapen_stopwords # pylint: disable=import-error import nltk # pylint: 
disable=import-error
 import pandas as pd  # pylint: disable=import-error
 from nltk import word_tokenize  # pylint: disable=import-error
@@ -14,15 +14,28 @@ from .oapen_types import (  # pylint: disable=relative-beyond-top-level
     OapenNgram,
 )
 
+stopword_paths = [
+    "model/stopwords_broken.txt",
+    "model/stopwords_dutch.txt",
+    "model/stopwords_filter.txt",
+    "model/stopwords_publisher.txt",
+]
+
+# Project-specific stopwords live in plain-text files, one entry per line.
+oapen_stopwords = []
+for p in stopword_paths:
+    # Entries contain non-ASCII characters; do not rely on the locale encoding.
+    with open(p, "r", encoding="utf-8") as f:
+        # Skip blank separator lines so "" never ends up in STOPWORDS.
+        oapen_stopwords += [line.rstrip() for line in f if line.strip()]
+
 nltk.download("stopwords")
 
 STOPWORDS = (
     stopwords.words("english")
     + stopwords.words("german")
     + stopwords.words("dutch")
-    + oapen_stopwords.stopwords_dutch_extra
-    + oapen_stopwords.stopwords_filter
-    + oapen_stopwords.stopwords_publisher
+    + oapen_stopwords
 )
 
 
diff --git a/oapen-engine/src/model/stopwords.py b/oapen-engine/src/model/stopwords.py
deleted file mode 100644
index 6701147..0000000
--- a/oapen-engine/src/model/stopwords.py
+++ /dev/null
@@ -1,435 +0,0 @@
-stopwords_filter = [
-    "4.0 deed.de",
-    "commons attribution",
-    "commons lizenz",
-    "commons namensnennung",
-    "creative commons",
-    "creativecommons",
-    "creativecommons.org",
-    "doi 10",
-    "doi.org",
-    "hrsg",
-    "isbn 13 978",
-    "isbn 3",
-    "isbn 978",
-    "noderivatives",
-    "noderivs",
-    "noncommercial",
-    "orcid.org",
-    "springer cham",
-    "university press",
-]
-
-# A list of stop words, connected to either open licenses or publishers
-stopwords_publisher = [
-    "2010 iwa publishing",
-    "2015 http www",
-    "4.0 deed.de veröffentli",
-    "4.0 international license",
-    "4.0 international lizenz",
-    "academic studies press",
-    "access book chapter",
-    "access chapter distributed",
-    "adapted material derived",
-    "akademie verlag berlin",
-    "albany suny press",
-    "amer math soc",
-    "american economic review",
-    "amsterdam amsterdam university",
-    "amsterdam john benjamins",
-    "amsterdam philadelphia john",
-    "angeben ob änderungen",
-    "aosis cape town",
-    "attribution 4.0 international",
-    "attribution licence cc",
- "attribution noncommercial noderivatives", - "attribution noncommercial noderivs", - "auckland auckland university", - "baden baden nomos", - "baltimore johns hopkins", - "baltimore md johns", - "basingstoke palgrave macmillan", - "basingstoke uk palgrave", - "benjamins publishing company", - "berlin akademie verlag", - "berlin heidelberg springer", - "berlin language science", - "berlin springer verlag", - "bielefeld transcript verlag", - "bloomington indiana university", - "boom juridische uitgevers", - "bridge cambridge university", - "brill nv leiden", - "british medical journal", - "brunswick nj rutgers", - "böhlau verlag ges.m.b.h", - "böhlau verlag gmbh", - "ca csli publications", - "ca stanford university", - "cambridge cambridge university", - "cambridge eng cambridge", - "cambridge harvard university", - "cambridge ma harvard", - "cambridge ma london", - "cambridge mass harvard", - "cambridge polity press", - "cambridge uk cambridge", - "clarendon press oxford", - "co.kg wien köln", - "commons attribution noncommercial", - "commons lizenz sofern", - "ct yale university", - "delhi oxford university", - "doi 63 9789004", - "doi https doi", - "drittmaterial unterliegen ebenfalls", - "durham duke university", - "durham n.c duke", - "durham nc duke", - "earth syst sci", - "edinburgh edinburgh university", - "eds tacas 2020", - "evanston il northwestern", - "evanston ill northwestern", - "format erlaubt sofern", - "frankfurt a.m campus", - "frankfurt a.m fischer", - "frankfurt a.m suhr", - "frankfurt a.m suhrkamp", - "frankfurt main suhrkamp", - "franz steiner verlag", - "gent academia press", - "gesetzlichen vorschriften erlaubt", - "gmbh co.kg wien", - "grand rapids mi", - "gravenhage sdu uitgeverij", - "groningen wolters noordhoff", - "göttingen vandenhoeck ruprecht", - "h.d tjeenk willink", - "haag boom juridische", - "haag boom lemma", - "haag sdu uitgeverij", - "haag sdu uitgevers", - "harvard business review", - "haven ct yale", - "haven yale 
university", - "hong princeton n.j", - "html letzter zugriff", - "html zuletzt abgerufen", - "html zuletzt geprüft", - "http csli publications.stanford.edu", - "http www.youtube.com watch", - "https en.wikipedia.org wiki", - "https www.youtube.com watch", - "intentionally left blank", - "international license http", - "international lizenz http", - "ithaca cornell university", - "ithaca ny cornell", - "iwa publishing london", - "jeweiligen rechteinhabers einzuholen", - "john benjamins doi", - "john benjamins publishing", - "john wiley sons", - "johns hopkins university", - "kapitel enthaltenen bilder", - "kluwer academic publishers", - "kluwer law international", - "koninklijke brill nv", - "köln weimar wien", - "language science press", - "leiden boston brill", - "leiden e.j brill", - "leiden kitlv press", - "leiden stenfert kroese", - "london duke university", - "london i.b tauris", - "london john murray", - "london oxford university", - "london penguin books", - "london pluto press", - "london routledge kegan", - "london routledge pp", - "london sage publications", - "london ubiquity press", - "london ucl press", - "london zed books", - "ma harvard university", - "main peter lang", - "manchester manchester university", - "mass harvard university", - "medizinisch wissenschaftliche verlagsgesellschaft", - "mohr paul siebeck", - "mohr siebeck tübingen", - "münchen wilhelm fink", - "n.j princeton university", - "namensnennung 4.0 international", - "national lizenz http", - "nc 4.0 license", - "nc duke university", - "nc sa 4.0", - "nj prentice hall", - "nj prince ton", - "nj princeton university", - "nj rutgers university", - "noderivatives 4.0 license", - "noncommercial noderivatives 4.0", - "north carolina press", - "notre dame press", - "nutzung vervielfältigung bearbeitung", - "ny cornell university", - "ny orbis books", - "ob änderungen vorgenommen", - "obtain permission directly", - "online url http", - "opladen leske budrich", - "opladen westdeutscher 
verlag", - "opladen westdt verlag", - "otto cramwinckel uitgever", - "oxford clarendon press", - "oxford oxford u.p", - "oxford oxford university", - "p.i.e peter lang", - "page intentionally left", - "paris presses universitaires", - "pdf letzter zugriff", - "pdf zuletzt aufgerufen", - "pdf zuletzt eingesehen", - "pdf zuletzt geprüft", - "peter lang frankfurt", - "phd diss university", - "phd thesis university", - "philadelphia john benjamins", - "press 2019 pp", - "press cambridge ma", - "press cambridge mass", - "press cambridge pp", - "press doi https", - "press oxford pp", - "press washington d.c", - "princeton n.j princeton", - "princeton nj princeton", - "princeton princeton university", - "publications http csli", - "quelle eigene berechnungen", - "quelle eigene darstellung", - "quelle ordnungsgemäß nennen", - "samsom uitgeverij alphen", - "science press doi", - "share adapted material", - "sharing adaptation distribution", - "siehe dazu u.a", - "siehe reg nr", - "sites default files", - "sonstiges drittmaterial unterliegen", - "springer berlin heidelberg", - "st martin’s press", - "stanford ca csli", - "stanford ca stanford", - "stanford calif stanford", - "stanford stanford university", - "sydney sydney university", - "toronto barbara budrich", - "tübingen mohr siebeck", - "ubiquity press doi", - "uitgeverij bert bakker", - "uk cambridge university", - "uk palgrave macmillan", - "unpublished phd dissertation", - "url http bit.ly", - "verlag barbara budrich", - "verlag gmbh co.kg", - "vervielfältigung bearbeitung verbreitung", - "vis à vis", - "westminster press doi", - "wien köln weimar", - "wissenschaftliche verlagsgesellschaft berlin", - "york academic press", - "york basic books", - "york berghahn books", - "york cambridge university", - "york columbia university", - "york fordham university", - "york free press", - "york grove press", - "york mcgraw hill", - "york ny oxford", - "york ny routledge", - "york oxford university", - "york palgrave 
macmillan", - "york penguin books", - "york peter lang", - "york random house", - "york schocken books", - "york st martin’s", - "york vintage international", - "york zed books", - "zwolle w.e.j tjeenk", - "à la fois", -] - -# A list of 'broken' words, not to be used as trigrams -stopwords_broken = [ - "ac tuele bestaansonzekerheid", - "bedeutungsexpl ikat ion", - "bedeutungsexpl ikat ionen", - "bes ted ingen", - "biopo liti cal", - "bl att mitt", - "box offi ce", - "ca rib bean", - "ca ribb ean", - "capital fl ows", - "car ib bean", - "car ibb ean", - "chris tian ity", - "chris tiani ty", - "christ ian ity", - "christ iani ty", - "connec ted curr", - "con tac ten", - "consumptieve bes ted", - "cont ro le", - "dif ere nt", - "dif fer ent", - "dif fere nt", - "diff ere nt", - "diff erent kinds", - "direct eff ect", - "doe le inden", - "empir ische untersuchung", - "esp rä ch", - "est äti gt", - "eu ro pe", - "eu ro pean", - "eu rop ean", - "evo lu tion", - "fe mi nist", - "fi ction publishers", - "fi ction publishing", - "fi lm noir", - "fi lm theory", - "fi losofi sche", - "fi nal section", - "fi nancial crisis", - "fi nancial markets", - "fi rm level", - "fi rst book", - "fi rst century", - "fi rst chapter", - "fi rst generation", - "fi rst muve", - "fi rst person", - "fi rst principle", - "fi rst time", - "fi ve yearly", - "fl ow cell", - "fl ow direction", - "fl ow regime", - "fl ow velocities", - "fo ku ss", - "fo lg lic", - "fo lk lo", - "fo lk poetry", - "fo rm ation", - "fo rm ationen", - "foreign offi ce", - "ga ni za", - "ge legenhe id", - "geopo liti cal", - "geopol iti cal", - "government offi cials", - "gr aph ic", - "hu id ige", - "ia le zaken", - "ial med ia", - "ic ul ar", - "idsbe le id", - "iebe le id", - "iesa menlev ing", - "ind ica toren", - "ind iv iduen", - "inf luence attempts", - "ingsbe le id", - "inso fe rn", - "inves ter ingen", - "ir ish women", - "ite ra tu", - "ite ra tur", - "jens mar tin", - "knowledge pol icy", - "le id ig", - "le 
id ing", - "leading cit ies", - "lex ibe le", - "lie ß lic", - "lit era ture", - "lite ra ture", - "lm ä ß", - "local fi rms", - "maat rege len", - "mar tin gurr", - "mitt el alt", - "mo ve ment", - "nat iona le", - "ni za tion", - "nst verle ning", - "offi cial nationality", - "onde rzoek ingen", - "pa rt ii", - "pa rt iii", - "par tic ul", - "par ticu lar", - "perf orm ance", - "po liti cal", - "pol iti cal", - "pos si ble", - "pos sib le", - "poss ib le", - "publ ic archaeology", - "pulp fi ction", - "repre sen ta", - "ro pean union", - "rzäh le rs", - "samen lev ing", - "science fi ction", - "scientif ic research", - "sen ta tion", - "sen ta tions", - "sl agva ardigheid", - "soc ia le", - "soc ial med", - "sp ra ch", - "sp rach lich", - "sp rach liche", - "sp rach lichen", - "ssp ra ch", - "st err ei", - "subs id ies", - "ted curr iculum", - "tele vi sion", - "tic ul ar", - "tive commons license", - "tu ra lis", - "tu ra lly", - "twenty fi rst", - "twenty fi ve", - "ty lis tic", - "uoo rbee ld", - "ve rhoud ing", - "ve rö ffe", - "veren ig ing", - "vi sion dramas", - "voorz ien ingen", - "werkge legenhe id", - "wi jz ig", - "zusä tz lich", - "ä sthe tik", - "ä tz lic", - "ä ß ig", - "ö st err", - "öst err ur", -] - -# A list of Dutch stop words, not part of stopwords_nl -stopwords_dutch_extra = ["een", "te", "ten"] diff --git a/oapen-engine/src/model/stopwords_broken.txt b/oapen-engine/src/model/stopwords_broken.txt new file mode 100644 index 0000000..d336e00 --- /dev/null +++ b/oapen-engine/src/model/stopwords_broken.txt @@ -0,0 +1,159 @@ +ac tuele bestaansonzekerheid +bedeutungsexpl ikat ion +bedeutungsexpl ikat ionen +bes ted ingen +biopo liti cal +bl att mitt +box offi ce +ca rib bean +ca ribb ean +capital fl ows +car ib bean +car ibb ean +chris tian ity +chris tiani ty +christ ian ity +christ iani ty +connec ted curr +con tac ten +consumptieve bes ted +cont ro le +dif ere nt +dif fer ent +dif fere nt +diff ere nt +diff erent kinds +direct eff ect +doe le 
inden +empir ische untersuchung +esp rä ch +est äti gt +eu ro pe +eu ro pean +eu rop ean +evo lu tion +fe mi nist +fi ction publishers +fi ction publishing +fi lm noir +fi lm theory +fi losofi sche +fi nal section +fi nancial crisis +fi nancial markets +fi rm level +fi rst book +fi rst century +fi rst chapter +fi rst generation +fi rst muve +fi rst person +fi rst principle +fi rst time +fi ve yearly +fl ow cell +fl ow direction +fl ow regime +fl ow velocities +fo ku ss +fo lg lic +fo lk lo +fo lk poetry +fo rm ation +fo rm ationen +foreign offi ce +ga ni za +ge legenhe id +geopo liti cal +geopol iti cal +government offi cials +gr aph ic +hu id ige +ia le zaken +ial med ia +ic ul ar +idsbe le id +iebe le id +iesa menlev ing +ind ica toren +ind iv iduen +inf luence attempts +ingsbe le id +inso fe rn +inves ter ingen +ir ish women +ite ra tu +ite ra tur +jens mar tin +knowledge pol icy +le id ig +le id ing +leading cit ies +lex ibe le +lie ß lic +lit era ture +lite ra ture +lm ä ß +local fi rms +maat rege len +mar tin gurr +mitt el alt +mo ve ment +nat iona le +ni za tion +nst verle ning +offi cial nationality +onde rzoek ingen +pa rt ii +pa rt iii +par tic ul +par ticu lar +perf orm ance +po liti cal +pol iti cal +pos si ble +pos sib le +poss ib le +publ ic archaeology +pulp fi ction +repre sen ta +ro pean union +rzäh le rs +samen lev ing +science fi ction +scientif ic research +sen ta tion +sen ta tions +sl agva ardigheid +soc ia le +soc ial med +sp ra ch +sp rach lich +sp rach liche +sp rach lichen +ssp ra ch +st err ei +subs id ies +ted curr iculum +tele vi sion +tic ul ar +tive commons license +tu ra lis +tu ra lly +twenty fi rst +twenty fi ve +ty lis tic +uoo rbee ld +ve rhoud ing +ve rö ffe +veren ig ing +vi sion dramas +voorz ien ingen +werkge legenhe id +wi jz ig +zusä tz lich +ä sthe tik +ä tz lic +ä ß ig +ö st err +öst err ur \ No newline at end of file diff --git a/oapen-engine/src/model/stopwords_dutch.txt b/oapen-engine/src/model/stopwords_dutch.txt new 
file mode 100644 index 0000000..41f55e9 --- /dev/null +++ b/oapen-engine/src/model/stopwords_dutch.txt @@ -0,0 +1,428 @@ +4.0 deed.de +commons attribution +commons lizenz +commons namensnennung +creative commons +creativecommons +creativecommons.org +doi 10 +doi.org +hrsg +isbn 13 978 +isbn 3 +isbn 978 +noderivatives +noderivs +noncommercial +orcid.org +springer cham +university press + +2010 iwa publishing +2015 http www +4.0 deed.de veröffentli +4.0 international license +4.0 international lizenz +academic studies press +access book chapter +access chapter distributed +adapted material derived +akademie verlag berlin +albany suny press +amer math soc +american economic review +amsterdam amsterdam university +amsterdam john benjamins +amsterdam philadelphia john +angeben ob änderungen +aosis cape town +attribution 4.0 international +attribution licence cc +attribution noncommercial noderivatives +attribution noncommercial noderivs +auckland auckland university +baden baden nomos +baltimore johns hopkins +baltimore md johns +basingstoke palgrave macmillan +basingstoke uk palgrave +benjamins publishing company +berlin akademie verlag +berlin heidelberg springer +berlin language science +berlin springer verlag +bielefeld transcript verlag +bloomington indiana university +boom juridische uitgevers +bridge cambridge university +brill nv leiden +british medical journal +brunswick nj rutgers +böhlau verlag ges.m.b.h +böhlau verlag gmbh +ca csli publications +ca stanford university +cambridge cambridge university +cambridge eng cambridge +cambridge harvard university +cambridge ma harvard +cambridge ma london +cambridge mass harvard +cambridge polity press +cambridge uk cambridge +clarendon press oxford +co.kg wien köln +commons attribution noncommercial +commons lizenz sofern +ct yale university +delhi oxford university +doi 63 9789004 +doi https doi +drittmaterial unterliegen ebenfalls +durham duke university +durham n.c duke +durham nc duke +earth syst sci +edinburgh 
edinburgh university +eds tacas 2020 +evanston il northwestern +evanston ill northwestern +format erlaubt sofern +frankfurt a.m campus +frankfurt a.m fischer +frankfurt a.m suhr +frankfurt a.m suhrkamp +frankfurt main suhrkamp +franz steiner verlag +gent academia press +gesetzlichen vorschriften erlaubt +gmbh co.kg wien +grand rapids mi +gravenhage sdu uitgeverij +groningen wolters noordhoff +göttingen vandenhoeck ruprecht +h.d tjeenk willink +haag boom juridische +haag boom lemma +haag sdu uitgeverij +haag sdu uitgevers +harvard business review +haven ct yale +haven yale university +hong princeton n.j +html letzter zugriff +html zuletzt abgerufen +html zuletzt geprüft +http csli publications.stanford.edu +http www.youtube.com watch +https en.wikipedia.org wiki +https www.youtube.com watch +intentionally left blank +international license http +international lizenz http +ithaca cornell university +ithaca ny cornell +iwa publishing london +jeweiligen rechteinhabers einzuholen +john benjamins doi +john benjamins publishing +john wiley sons +johns hopkins university +kapitel enthaltenen bilder +kluwer academic publishers +kluwer law international +koninklijke brill nv +köln weimar wien +language science press +leiden boston brill +leiden e.j brill +leiden kitlv press +leiden stenfert kroese +london duke university +london i.b tauris +london john murray +london oxford university +london penguin books +london pluto press +london routledge kegan +london routledge pp +london sage publications +london ubiquity press +london ucl press +london zed books +ma harvard university +main peter lang +manchester manchester university +mass harvard university +medizinisch wissenschaftliche verlagsgesellschaft +mohr paul siebeck +mohr siebeck tübingen +münchen wilhelm fink +n.j princeton university +namensnennung 4.0 international +national lizenz http +nc 4.0 license +nc duke university +nc sa 4.0 +nj prentice hall +nj prince ton +nj princeton university +nj rutgers university 
+noderivatives 4.0 license +noncommercial noderivatives 4.0 +north carolina press +notre dame press +nutzung vervielfältigung bearbeitung +ny cornell university +ny orbis books +ob änderungen vorgenommen +obtain permission directly +online url http +opladen leske budrich +opladen westdeutscher verlag +opladen westdt verlag +otto cramwinckel uitgever +oxford clarendon press +oxford oxford u.p +oxford oxford university +p.i.e peter lang +page intentionally left +paris presses universitaires +pdf letzter zugriff +pdf zuletzt aufgerufen +pdf zuletzt eingesehen +pdf zuletzt geprüft +peter lang frankfurt +phd diss university +phd thesis university +philadelphia john benjamins +press 2019 pp +press cambridge ma +press cambridge mass +press cambridge pp +press doi https +press oxford pp +press washington d.c +princeton n.j princeton +princeton nj princeton +princeton princeton university +publications http csli +quelle eigene berechnungen +quelle eigene darstellung +quelle ordnungsgemäß nennen +samsom uitgeverij alphen +science press doi +share adapted material +sharing adaptation distribution +siehe dazu u.a +siehe reg nr +sites default files +sonstiges drittmaterial unterliegen +springer berlin heidelberg +st martin’s press +stanford ca csli +stanford ca stanford +stanford calif stanford +stanford stanford university +sydney sydney university +toronto barbara budrich +tübingen mohr siebeck +ubiquity press doi +uitgeverij bert bakker +uk cambridge university +uk palgrave macmillan +unpublished phd dissertation +url http bit.ly +verlag barbara budrich +verlag gmbh co.kg +vervielfältigung bearbeitung verbreitung +vis à vis +westminster press doi +wien köln weimar +wissenschaftliche verlagsgesellschaft berlin +york academic press +york basic books +york berghahn books +york cambridge university +york columbia university +york fordham university +york free press +york grove press +york mcgraw hill +york ny oxford +york ny routledge +york oxford university +york palgrave 
macmillan +york penguin books +york peter lang +york random house +york schocken books +york st martin’s +york vintage international +york zed books +zwolle w.e.j tjeenk +à la fois + +ac tuele bestaansonzekerheid +bedeutungsexpl ikat ion +bedeutungsexpl ikat ionen +bes ted ingen +biopo liti cal +bl att mitt +box offi ce +ca rib bean +ca ribb ean +capital fl ows +car ib bean +car ibb ean +chris tian ity +chris tiani ty +christ ian ity +christ iani ty +connec ted curr +con tac ten +consumptieve bes ted +cont ro le +dif ere nt +dif fer ent +dif fere nt +diff ere nt +diff erent kinds +direct eff ect +doe le inden +empir ische untersuchung +esp rä ch +est äti gt +eu ro pe +eu ro pean +eu rop ean +evo lu tion +fe mi nist +fi ction publishers +fi ction publishing +fi lm noir +fi lm theory +fi losofi sche +fi nal section +fi nancial crisis +fi nancial markets +fi rm level +fi rst book +fi rst century +fi rst chapter +fi rst generation +fi rst muve +fi rst person +fi rst principle +fi rst time +fi ve yearly +fl ow cell +fl ow direction +fl ow regime +fl ow velocities +fo ku ss +fo lg lic +fo lk lo +fo lk poetry +fo rm ation +fo rm ationen +foreign offi ce +ga ni za +ge legenhe id +geopo liti cal +geopol iti cal +government offi cials +gr aph ic +hu id ige +ia le zaken +ial med ia +ic ul ar +idsbe le id +iebe le id +iesa menlev ing +ind ica toren +ind iv iduen +inf luence attempts +ingsbe le id +inso fe rn +inves ter ingen +ir ish women +ite ra tu +ite ra tur +jens mar tin +knowledge pol icy +le id ig +le id ing +leading cit ies +lex ibe le +lie ß lic +lit era ture +lite ra ture +lm ä ß +local fi rms +maat rege len +mar tin gurr +mitt el alt +mo ve ment +nat iona le +ni za tion +nst verle ning +offi cial nationality +onde rzoek ingen +pa rt ii +pa rt iii +par tic ul +par ticu lar +perf orm ance +po liti cal +pol iti cal +pos si ble +pos sib le +poss ib le +publ ic archaeology +pulp fi ction +repre sen ta +ro pean union +rzäh le rs +samen lev ing +science fi ction +scientif 
ic research +sen ta tion +sen ta tions +sl agva ardigheid +soc ia le +soc ial med +sp ra ch +sp rach lich +sp rach liche +sp rach lichen +ssp ra ch +st err ei +subs id ies +ted curr iculum +tele vi sion +tic ul ar +tive commons license +tu ra lis +tu ra lly +twenty fi rst +twenty fi ve +ty lis tic +uoo rbee ld +ve rhoud ing +ve rö ffe +veren ig ing +vi sion dramas +voorz ien ingen +werkge legenhe id +wi jz ig +zusä tz lich +ä sthe tik +ä tz lic +ä ß ig +ö st err +öst err ur + +een +te +ten diff --git a/oapen-engine/src/model/stopwords_filter.txt b/oapen-engine/src/model/stopwords_filter.txt new file mode 100644 index 0000000..53a10da --- /dev/null +++ b/oapen-engine/src/model/stopwords_filter.txt @@ -0,0 +1,19 @@ +4.0 deed.de +commons attribution +commons lizenz +commons namensnennung +creative commons +creativecommons +creativecommons.org +doi 10 +doi.org +hrsg +isbn 13 978 +isbn 3 +isbn 978 +noderivatives +noderivs +noncommercial +orcid.org +springer cham +university press \ No newline at end of file diff --git a/oapen-engine/src/model/stopwords_publisher.txt b/oapen-engine/src/model/stopwords_publisher.txt new file mode 100644 index 0000000..6d91655 --- /dev/null +++ b/oapen-engine/src/model/stopwords_publisher.txt @@ -0,0 +1,244 @@ +2010 iwa publishing +2015 http www +4.0 deed.de veröffentli +4.0 international license +4.0 international lizenz +academic studies press +access book chapter +access chapter distributed +adapted material derived +akademie verlag berlin +albany suny press +amer math soc +american economic review +amsterdam amsterdam university +amsterdam john benjamins +amsterdam philadelphia john +angeben ob änderungen +aosis cape town +attribution 4.0 international +attribution licence cc +attribution noncommercial noderivatives +attribution noncommercial noderivs +auckland auckland university +baden baden nomos +baltimore johns hopkins +baltimore md johns +basingstoke palgrave macmillan +basingstoke uk palgrave +benjamins publishing company 
+berlin akademie verlag +berlin heidelberg springer +berlin language science +berlin springer verlag +bielefeld transcript verlag +bloomington indiana university +boom juridische uitgevers +bridge cambridge university +brill nv leiden +british medical journal +brunswick nj rutgers +böhlau verlag ges.m.b.h +böhlau verlag gmbh +ca csli publications +ca stanford university +cambridge cambridge university +cambridge eng cambridge +cambridge harvard university +cambridge ma harvard +cambridge ma london +cambridge mass harvard +cambridge polity press +cambridge uk cambridge +clarendon press oxford +co.kg wien köln +commons attribution noncommercial +commons lizenz sofern +ct yale university +delhi oxford university +doi 63 9789004 +doi https doi +drittmaterial unterliegen ebenfalls +durham duke university +durham n.c duke +durham nc duke +earth syst sci +edinburgh edinburgh university +eds tacas 2020 +evanston il northwestern +evanston ill northwestern +format erlaubt sofern +frankfurt a.m campus +frankfurt a.m fischer +frankfurt a.m suhr +frankfurt a.m suhrkamp +frankfurt main suhrkamp +franz steiner verlag +gent academia press +gesetzlichen vorschriften erlaubt +gmbh co.kg wien +grand rapids mi +gravenhage sdu uitgeverij +groningen wolters noordhoff +göttingen vandenhoeck ruprecht +h.d tjeenk willink +haag boom juridische +haag boom lemma +haag sdu uitgeverij +haag sdu uitgevers +harvard business review +haven ct yale +haven yale university +hong princeton n.j +html letzter zugriff +html zuletzt abgerufen +html zuletzt geprüft +http csli publications.stanford.edu +http www.youtube.com watch +https en.wikipedia.org wiki +https www.youtube.com watch +intentionally left blank +international license http +international lizenz http +ithaca cornell university +ithaca ny cornell +iwa publishing london +jeweiligen rechteinhabers einzuholen +john benjamins doi +john benjamins publishing +john wiley sons +johns hopkins university +kapitel enthaltenen bilder +kluwer academic 
publishers +kluwer law international +koninklijke brill nv +köln weimar wien +language science press +leiden boston brill +leiden e.j brill +leiden kitlv press +leiden stenfert kroese +london duke university +london i.b tauris +london john murray +london oxford university +london penguin books +london pluto press +london routledge kegan +london routledge pp +london sage publications +london ubiquity press +london ucl press +london zed books +ma harvard university +main peter lang +manchester manchester university +mass harvard university +medizinisch wissenschaftliche verlagsgesellschaft +mohr paul siebeck +mohr siebeck tübingen +münchen wilhelm fink +n.j princeton university +namensnennung 4.0 international +national lizenz http +nc 4.0 license +nc duke university +nc sa 4.0 +nj prentice hall +nj prince ton +nj princeton university +nj rutgers university +noderivatives 4.0 license +noncommercial noderivatives 4.0 +north carolina press +notre dame press +nutzung vervielfältigung bearbeitung +ny cornell university +ny orbis books +ob änderungen vorgenommen +obtain permission directly +online url http +opladen leske budrich +opladen westdeutscher verlag +opladen westdt verlag +otto cramwinckel uitgever +oxford clarendon press +oxford oxford u.p +oxford oxford university +p.i.e peter lang +page intentionally left +paris presses universitaires +pdf letzter zugriff +pdf zuletzt aufgerufen +pdf zuletzt eingesehen +pdf zuletzt geprüft +peter lang frankfurt +phd diss university +phd thesis university +philadelphia john benjamins +press 2019 pp +press cambridge ma +press cambridge mass +press cambridge pp +press doi https +press oxford pp +press washington d.c +princeton n.j princeton +princeton nj princeton +princeton princeton university +publications http csli +quelle eigene berechnungen +quelle eigene darstellung +quelle ordnungsgemäß nennen +samsom uitgeverij alphen +science press doi +share adapted material +sharing adaptation distribution +siehe dazu u.a +siehe reg 
nr +sites default files +sonstiges drittmaterial unterliegen +springer berlin heidelberg +st martin’s press +stanford ca csli +stanford ca stanford +stanford calif stanford +stanford stanford university +sydney sydney university +toronto barbara budrich +tübingen mohr siebeck +ubiquity press doi +uitgeverij bert bakker +uk cambridge university +uk palgrave macmillan +unpublished phd dissertation +url http bit.ly +verlag barbara budrich +verlag gmbh co.kg +vervielfältigung bearbeitung verbreitung +vis à vis +westminster press doi +wien köln weimar +wissenschaftliche verlagsgesellschaft berlin +york academic press +york basic books +york berghahn books +york cambridge university +york columbia university +york fordham university +york free press +york grove press +york mcgraw hill +york ny oxford +york ny routledge +york oxford university +york palgrave macmillan +york penguin books +york peter lang +york random house +york schocken books +york st martin’s +york vintage international +york zed books +zwolle w.e.j tjeenk +à la fois \ No newline at end of file