Merge branch 'EbookFoundation:main' into main
commit
3392f79665
|
@ -1,31 +1,18 @@
|
||||||
repos:
|
repos:
|
||||||
- repo: local
|
- repo: https://github.com/pycqa/isort
|
||||||
|
rev: 5.10.1
|
||||||
hooks:
|
hooks:
|
||||||
- id: isort
|
- id: isort
|
||||||
name: isort
|
args: [--profile, black, --filter-files, oapen-engine/src]
|
||||||
stages: [commit]
|
name: isort (python)
|
||||||
language: system
|
- repo: https://github.com/psf/black
|
||||||
entry: cd oapen-engine && pipenv run isort src/
|
rev: stable
|
||||||
types: [python]
|
hooks:
|
||||||
|
|
||||||
- id: black
|
- id: black
|
||||||
name: black
|
language_version: python3.10
|
||||||
stages: [commit]
|
args: [oapen-engine/src]
|
||||||
language: system
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||||
entry: cd oapen-engine && pipenv run black src/
|
rev: v1.2.3
|
||||||
types: [python]
|
hooks:
|
||||||
|
|
||||||
- id: flake8
|
- id: flake8
|
||||||
name: flake8
|
args: [oapen-engine/src]
|
||||||
stages: [commit]
|
|
||||||
language: system
|
|
||||||
entry: cd oapen-engine && pipenv run flake8 src/
|
|
||||||
types: [python]
|
|
||||||
exclude: setup.py
|
|
||||||
|
|
||||||
- id: pytest
|
|
||||||
name: pytest
|
|
||||||
stages: [commit]
|
|
||||||
language: system
|
|
||||||
entry: cd oapen-engine && pipenv run pytest
|
|
||||||
types: [python]
|
|
|
@ -0,0 +1,18 @@
|
||||||
|
# Every target below is a command, not a file to build. Declaring them phony
# keeps make from skipping a target when a file or directory with the same
# name (e.g. `run`) happens to exist.
.PHONY: setup-env seed_db clean_db run

# Upgrade pip, install pipenv and the project's dependencies, then open a
# pipenv shell. On Windows the `py` launcher is used instead of `python`.
setup-env:
ifeq ($(OS),Windows_NT)
	py -m pip install --upgrade pip
else
	python -m pip install --upgrade pip
endif
	pip install pipenv
	pipenv install
	pipenv shell

# Run tasks/seed.py inside the pipenv environment.
seed_db:
	cd src && pipenv run python tasks/seed.py

# Run tasks/clean.py inside the pipenv environment.
clean_db:
	cd src && pipenv run python tasks/clean.py

# Run the main entry point inside the pipenv environment.
run:
	cd src && pipenv run python main.py
|
|
@ -1,5 +1,6 @@
|
||||||
# OAPEN Suggestion Service
|
# OAPEN Suggestion Service
|
||||||
## Database Configuration (Local)
|
## Getting Started
|
||||||
|
### Database Configuration (Local)
|
||||||
Create a `database.ini` file in `oapen-engine/src` with the following:
|
Create a `database.ini` file in `oapen-engine/src` with the following:
|
||||||
```
|
```
|
||||||
[postgresql]
|
[postgresql]
|
||||||
|
@ -8,14 +9,21 @@ database=postgres
|
||||||
user=<username>
|
user=<username>
|
||||||
password=<your-password>
|
password=<your-password>
|
||||||
```
|
```
|
||||||
## Running with Pipenv
|
### Environment setup
|
||||||
```
|
```
|
||||||
pipenv install
|
cd oapen-engine
|
||||||
pipenv shell
|
make setup-env
|
||||||
cd src
|
|
||||||
python main.py
|
|
||||||
```
|
```
|
||||||
## Deactivate virtual environment
|
### Seeding the database
|
||||||
|
```
|
||||||
|
make clean_db
|
||||||
|
make seed_db
|
||||||
|
```
|
||||||
|
### Running ngrams
|
||||||
|
```
|
||||||
|
make run
|
||||||
|
```
|
||||||
|
## How to deactivate virtual environment
|
||||||
While the virtual environment is running, type:
|
While the virtual environment is running, type:
|
||||||
```
|
```
|
||||||
deactivate
|
deactivate
|
||||||
|
|
|
@ -2,13 +2,11 @@
|
||||||
from configparser import ConfigParser
|
from configparser import ConfigParser
|
||||||
|
|
||||||
|
|
||||||
def config(filename="../database.ini", section="postgresql"):
|
def config(filename="database.ini", section="postgresql"):
|
||||||
# create a parser
|
|
||||||
parser = ConfigParser()
|
parser = ConfigParser()
|
||||||
# read config file
|
|
||||||
parser.read(filename)
|
parser.read(filename)
|
||||||
|
|
||||||
# get section, default to postgresql
|
|
||||||
db = {}
|
db = {}
|
||||||
if parser.has_section(section):
|
if parser.has_section(section):
|
||||||
params = parser.items(section)
|
params = parser.items(section)
|
||||||
|
|
|
@ -1,39 +1,35 @@
|
||||||
#!/usr/bin/python
|
#!/usr/bin/python
|
||||||
import psycopg2
|
import psycopg2
|
||||||
from config import config
|
from data.config import config
|
||||||
|
|
||||||
|
|
||||||
def connect():
|
def get_connection():
|
||||||
"""Connect to the PostgreSQL database server"""
|
|
||||||
conn = None
|
conn = None
|
||||||
try:
|
try:
|
||||||
# read connection parameters
|
|
||||||
params = config()
|
params = config()
|
||||||
|
|
||||||
# connect to the PostgreSQL server
|
|
||||||
print("Connecting to the PostgreSQL database...")
|
print("Connecting to the PostgreSQL database...")
|
||||||
conn = psycopg2.connect(**params)
|
conn = psycopg2.connect(**params)
|
||||||
|
conn.autocommit = True
|
||||||
|
|
||||||
# create a cursor
|
|
||||||
cur = conn.cursor()
|
cur = conn.cursor()
|
||||||
|
|
||||||
# execute a statement
|
|
||||||
print("PostgreSQL database version:")
|
print("PostgreSQL database version:")
|
||||||
cur.execute("SELECT version()")
|
cur.execute("SELECT version()")
|
||||||
|
|
||||||
# display the PostgreSQL database server version
|
|
||||||
db_version = cur.fetchone()
|
db_version = cur.fetchone()
|
||||||
print(db_version)
|
print(db_version)
|
||||||
|
|
||||||
# close the communication with the PostgreSQL
|
|
||||||
cur.close()
|
cur.close()
|
||||||
|
|
||||||
except (Exception, psycopg2.DatabaseError) as error:
|
except (Exception, psycopg2.DatabaseError) as error:
|
||||||
print(error)
|
print(error)
|
||||||
finally:
|
finally:
|
||||||
if conn is not None:
|
return conn
|
||||||
conn.close()
|
return conn
|
||||||
print("Database connection closed.")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
def close_connection(conn):
|
||||||
connect()
|
if conn is not None:
|
||||||
|
conn.close()
|
||||||
|
print("Database connection closed.")
|
||||||
|
|
|
@ -0,0 +1,50 @@
|
||||||
|
from logging import Logger
|
||||||
|
|
||||||
|
from model.oapen_types import OapenSuggestion
|
||||||
|
|
||||||
|
|
||||||
|
def table_exists(connection, table):
    """Return the result of the ``select exists(...)`` lookup for *table*.

    Args:
        connection: Open database connection providing ``cursor()``.
        table: Table name to look up; sent as a bound query parameter.

    Returns:
        The first column of the single row returned by the query.
    """
    # ``cursor`` is a method and must be called to obtain a cursor object.
    cursor = connection.cursor()
    try:
        # NOTE(review): this reads from ``oapen_suggestions.tables``. The
        # standard catalog view is ``information_schema.tables`` -- confirm
        # which relation was intended before relying on this function.
        cursor.execute(
            "select exists(select * from oapen_suggestions.tables where table_name=%s)",
            (table,),  # trailing comma: parameters must be a sequence, not a str
        )
        return cursor.fetchone()[0]
    finally:
        # Release the cursor even when the query fails.
        cursor.close()
|
||||||
|
|
||||||
|
|
||||||
|
def add_suggestion(connection, suggestion: "OapenSuggestion") -> None:
    """Insert a single row into ``oapen_suggestions.suggestions``.

    The insert is best-effort: a failure is logged with its traceback and not
    re-raised. The cursor is closed in every case.

    Args:
        connection: Open database connection providing ``cursor()``.
        suggestion: Sequence of the three values bound to the placeholders.
    """
    # Local import: only the ``Logger`` class is imported at module level, and
    # calling ``Logger.exception`` on the class itself raises ``TypeError``.
    import logging

    cursor = connection.cursor()

    query = """
        INSERT INTO oapen_suggestions.suggestions VALUES (%s, %s, %s)
        """

    try:
        cursor.execute(query, suggestion)
    except Exception:
        # ``exception`` records the active traceback along with the message.
        logging.getLogger(__name__).exception("Failed to insert suggestion")
    finally:
        cursor.close()
|
||||||
|
|
||||||
|
|
||||||
|
def add_many_suggestions(connection, suggestions) -> None:
    """Insert several rows into ``oapen_suggestions.suggestions`` in one statement.

    Each row is rendered with ``cursor.mogrify`` and the rendered tuples are
    joined into a single multi-row ``INSERT``. The insert is best-effort: a
    failure is logged with its traceback and not re-raised.

    Args:
        connection: Open database connection providing ``cursor()``.
        suggestions: Iterable of 3-item rows; the third item is cast to
            ``suggestion[]`` by the statement.
    """
    # Local import: only the ``Logger`` class is imported at module level, and
    # calling ``Logger.exception`` on the class itself raises ``TypeError``.
    import logging

    rows = list(suggestions)
    if not rows:
        # "INSERT ... VALUES" followed by no tuples is a SQL syntax error.
        return

    cursor = connection.cursor()
    try:
        # Rendering happens inside the ``try`` so a malformed row is logged
        # and the cursor is still closed.
        args_str = ",".join(
            cursor.mogrify("(%s,%s,%s::suggestion[])", row).decode("utf-8")
            for row in rows
        )

        query = f"""
            INSERT INTO oapen_suggestions.suggestions VALUES {args_str}
            """

        cursor.execute(query)
    except Exception:
        logging.getLogger(__name__).exception(
            "Failed to insert %d suggestion rows", len(rows)
        )
    finally:
        cursor.close()
|
|
@ -1,14 +1,13 @@
|
||||||
import string
|
import string
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import nltk
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
import data.oapen as OapenAPI
|
import data.oapen as OapenAPI
|
||||||
import lib.stopwords as oapen_stopwords
|
import lib.stopwords as oapen_stopwords
|
||||||
|
import nltk
|
||||||
|
import pandas as pd
|
||||||
from nltk import word_tokenize
|
from nltk import word_tokenize
|
||||||
from nltk.corpus import stopwords
|
from nltk.corpus import stopwords
|
||||||
|
|
||||||
from .oapen_types import OapenItem, transform_item_data
|
from .oapen_types import OapenItem, transform_item_data
|
||||||
|
|
||||||
nltk.download("stopwords")
|
nltk.download("stopwords")
|
||||||
|
|
|
@ -20,6 +20,9 @@ class OapenItem:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
OapenSuggestion = (str, int)
|
||||||
|
|
||||||
|
|
||||||
def transform_item_data(data) -> OapenItem:
|
def transform_item_data(data) -> OapenItem:
|
||||||
uuid = data["uuid"]
|
uuid = data["uuid"]
|
||||||
name = data["name"]
|
name = data["name"]
|
||||||
|
|
|
@ -0,0 +1,435 @@
|
||||||
|
stopwords_filter = [
|
||||||
|
"4.0 deed.de",
|
||||||
|
"commons attribution",
|
||||||
|
"commons lizenz",
|
||||||
|
"commons namensnennung",
|
||||||
|
"creative commons",
|
||||||
|
"creativecommons",
|
||||||
|
"creativecommons.org",
|
||||||
|
"doi 10",
|
||||||
|
"doi.org",
|
||||||
|
"hrsg",
|
||||||
|
"isbn 13 978",
|
||||||
|
"isbn 3",
|
||||||
|
"isbn 978",
|
||||||
|
"noderivatives",
|
||||||
|
"noderivs",
|
||||||
|
"noncommercial",
|
||||||
|
"orcid.org",
|
||||||
|
"springer cham",
|
||||||
|
"university press",
|
||||||
|
]
|
||||||
|
|
||||||
|
# A list of stop words, connected to either open licenses or publishers
|
||||||
|
stopwords_publisher = [
|
||||||
|
"2010 iwa publishing",
|
||||||
|
"2015 http www",
|
||||||
|
"4.0 deed.de veröffentli",
|
||||||
|
"4.0 international license",
|
||||||
|
"4.0 international lizenz",
|
||||||
|
"academic studies press",
|
||||||
|
"access book chapter",
|
||||||
|
"access chapter distributed",
|
||||||
|
"adapted material derived",
|
||||||
|
"akademie verlag berlin",
|
||||||
|
"albany suny press",
|
||||||
|
"amer math soc",
|
||||||
|
"american economic review",
|
||||||
|
"amsterdam amsterdam university",
|
||||||
|
"amsterdam john benjamins",
|
||||||
|
"amsterdam philadelphia john",
|
||||||
|
"angeben ob änderungen",
|
||||||
|
"aosis cape town",
|
||||||
|
"attribution 4.0 international",
|
||||||
|
"attribution licence cc",
|
||||||
|
"attribution noncommercial noderivatives",
|
||||||
|
"attribution noncommercial noderivs",
|
||||||
|
"auckland auckland university",
|
||||||
|
"baden baden nomos",
|
||||||
|
"baltimore johns hopkins",
|
||||||
|
"baltimore md johns",
|
||||||
|
"basingstoke palgrave macmillan",
|
||||||
|
"basingstoke uk palgrave",
|
||||||
|
"benjamins publishing company",
|
||||||
|
"berlin akademie verlag",
|
||||||
|
"berlin heidelberg springer",
|
||||||
|
"berlin language science",
|
||||||
|
"berlin springer verlag",
|
||||||
|
"bielefeld transcript verlag",
|
||||||
|
"bloomington indiana university",
|
||||||
|
"boom juridische uitgevers",
|
||||||
|
"bridge cambridge university",
|
||||||
|
"brill nv leiden",
|
||||||
|
"british medical journal",
|
||||||
|
"brunswick nj rutgers",
|
||||||
|
"böhlau verlag ges.m.b.h",
|
||||||
|
"böhlau verlag gmbh",
|
||||||
|
"ca csli publications",
|
||||||
|
"ca stanford university",
|
||||||
|
"cambridge cambridge university",
|
||||||
|
"cambridge eng cambridge",
|
||||||
|
"cambridge harvard university",
|
||||||
|
"cambridge ma harvard",
|
||||||
|
"cambridge ma london",
|
||||||
|
"cambridge mass harvard",
|
||||||
|
"cambridge polity press",
|
||||||
|
"cambridge uk cambridge",
|
||||||
|
"clarendon press oxford",
|
||||||
|
"co.kg wien köln",
|
||||||
|
"commons attribution noncommercial",
|
||||||
|
"commons lizenz sofern",
|
||||||
|
"ct yale university",
|
||||||
|
"delhi oxford university",
|
||||||
|
"doi 63 9789004",
|
||||||
|
"doi https doi",
|
||||||
|
"drittmaterial unterliegen ebenfalls",
|
||||||
|
"durham duke university",
|
||||||
|
"durham n.c duke",
|
||||||
|
"durham nc duke",
|
||||||
|
"earth syst sci",
|
||||||
|
"edinburgh edinburgh university",
|
||||||
|
"eds tacas 2020",
|
||||||
|
"evanston il northwestern",
|
||||||
|
"evanston ill northwestern",
|
||||||
|
"format erlaubt sofern",
|
||||||
|
"frankfurt a.m campus",
|
||||||
|
"frankfurt a.m fischer",
|
||||||
|
"frankfurt a.m suhr",
|
||||||
|
"frankfurt a.m suhrkamp",
|
||||||
|
"frankfurt main suhrkamp",
|
||||||
|
"franz steiner verlag",
|
||||||
|
"gent academia press",
|
||||||
|
"gesetzlichen vorschriften erlaubt",
|
||||||
|
"gmbh co.kg wien",
|
||||||
|
"grand rapids mi",
|
||||||
|
"gravenhage sdu uitgeverij",
|
||||||
|
"groningen wolters noordhoff",
|
||||||
|
"göttingen vandenhoeck ruprecht",
|
||||||
|
"h.d tjeenk willink",
|
||||||
|
"haag boom juridische",
|
||||||
|
"haag boom lemma",
|
||||||
|
"haag sdu uitgeverij",
|
||||||
|
"haag sdu uitgevers",
|
||||||
|
"harvard business review",
|
||||||
|
"haven ct yale",
|
||||||
|
"haven yale university",
|
||||||
|
"hong princeton n.j",
|
||||||
|
"html letzter zugriff",
|
||||||
|
"html zuletzt abgerufen",
|
||||||
|
"html zuletzt geprüft",
|
||||||
|
"http csli publications.stanford.edu",
|
||||||
|
"http www.youtube.com watch",
|
||||||
|
"https en.wikipedia.org wiki",
|
||||||
|
"https www.youtube.com watch",
|
||||||
|
"intentionally left blank",
|
||||||
|
"international license http",
|
||||||
|
"international lizenz http",
|
||||||
|
"ithaca cornell university",
|
||||||
|
"ithaca ny cornell",
|
||||||
|
"iwa publishing london",
|
||||||
|
"jeweiligen rechteinhabers einzuholen",
|
||||||
|
"john benjamins doi",
|
||||||
|
"john benjamins publishing",
|
||||||
|
"john wiley sons",
|
||||||
|
"johns hopkins university",
|
||||||
|
"kapitel enthaltenen bilder",
|
||||||
|
"kluwer academic publishers",
|
||||||
|
"kluwer law international",
|
||||||
|
"koninklijke brill nv",
|
||||||
|
"köln weimar wien",
|
||||||
|
"language science press",
|
||||||
|
"leiden boston brill",
|
||||||
|
"leiden e.j brill",
|
||||||
|
"leiden kitlv press",
|
||||||
|
"leiden stenfert kroese",
|
||||||
|
"london duke university",
|
||||||
|
"london i.b tauris",
|
||||||
|
"london john murray",
|
||||||
|
"london oxford university",
|
||||||
|
"london penguin books",
|
||||||
|
"london pluto press",
|
||||||
|
"london routledge kegan",
|
||||||
|
"london routledge pp",
|
||||||
|
"london sage publications",
|
||||||
|
"london ubiquity press",
|
||||||
|
"london ucl press",
|
||||||
|
"london zed books",
|
||||||
|
"ma harvard university",
|
||||||
|
"main peter lang",
|
||||||
|
"manchester manchester university",
|
||||||
|
"mass harvard university",
|
||||||
|
"medizinisch wissenschaftliche verlagsgesellschaft",
|
||||||
|
"mohr paul siebeck",
|
||||||
|
"mohr siebeck tübingen",
|
||||||
|
"münchen wilhelm fink",
|
||||||
|
"n.j princeton university",
|
||||||
|
"namensnennung 4.0 international",
|
||||||
|
"national lizenz http",
|
||||||
|
"nc 4.0 license",
|
||||||
|
"nc duke university",
|
||||||
|
"nc sa 4.0",
|
||||||
|
"nj prentice hall",
|
||||||
|
"nj prince ton",
|
||||||
|
"nj princeton university",
|
||||||
|
"nj rutgers university",
|
||||||
|
"noderivatives 4.0 license",
|
||||||
|
"noncommercial noderivatives 4.0",
|
||||||
|
"north carolina press",
|
||||||
|
"notre dame press",
|
||||||
|
"nutzung vervielfältigung bearbeitung",
|
||||||
|
"ny cornell university",
|
||||||
|
"ny orbis books",
|
||||||
|
"ob änderungen vorgenommen",
|
||||||
|
"obtain permission directly",
|
||||||
|
"online url http",
|
||||||
|
"opladen leske budrich",
|
||||||
|
"opladen westdeutscher verlag",
|
||||||
|
"opladen westdt verlag",
|
||||||
|
"otto cramwinckel uitgever",
|
||||||
|
"oxford clarendon press",
|
||||||
|
"oxford oxford u.p",
|
||||||
|
"oxford oxford university",
|
||||||
|
"p.i.e peter lang",
|
||||||
|
"page intentionally left",
|
||||||
|
"paris presses universitaires",
|
||||||
|
"pdf letzter zugriff",
|
||||||
|
"pdf zuletzt aufgerufen",
|
||||||
|
"pdf zuletzt eingesehen",
|
||||||
|
"pdf zuletzt geprüft",
|
||||||
|
"peter lang frankfurt",
|
||||||
|
"phd diss university",
|
||||||
|
"phd thesis university",
|
||||||
|
"philadelphia john benjamins",
|
||||||
|
"press 2019 pp",
|
||||||
|
"press cambridge ma",
|
||||||
|
"press cambridge mass",
|
||||||
|
"press cambridge pp",
|
||||||
|
"press doi https",
|
||||||
|
"press oxford pp",
|
||||||
|
"press washington d.c",
|
||||||
|
"princeton n.j princeton",
|
||||||
|
"princeton nj princeton",
|
||||||
|
"princeton princeton university",
|
||||||
|
"publications http csli",
|
||||||
|
"quelle eigene berechnungen",
|
||||||
|
"quelle eigene darstellung",
|
||||||
|
"quelle ordnungsgemäß nennen",
|
||||||
|
"samsom uitgeverij alphen",
|
||||||
|
"science press doi",
|
||||||
|
"share adapted material",
|
||||||
|
"sharing adaptation distribution",
|
||||||
|
"siehe dazu u.a",
|
||||||
|
"siehe reg nr",
|
||||||
|
"sites default files",
|
||||||
|
"sonstiges drittmaterial unterliegen",
|
||||||
|
"springer berlin heidelberg",
|
||||||
|
"st martin’s press",
|
||||||
|
"stanford ca csli",
|
||||||
|
"stanford ca stanford",
|
||||||
|
"stanford calif stanford",
|
||||||
|
"stanford stanford university",
|
||||||
|
"sydney sydney university",
|
||||||
|
"toronto barbara budrich",
|
||||||
|
"tübingen mohr siebeck",
|
||||||
|
"ubiquity press doi",
|
||||||
|
"uitgeverij bert bakker",
|
||||||
|
"uk cambridge university",
|
||||||
|
"uk palgrave macmillan",
|
||||||
|
"unpublished phd dissertation",
|
||||||
|
"url http bit.ly",
|
||||||
|
"verlag barbara budrich",
|
||||||
|
"verlag gmbh co.kg",
|
||||||
|
"vervielfältigung bearbeitung verbreitung",
|
||||||
|
"vis à vis",
|
||||||
|
"westminster press doi",
|
||||||
|
"wien köln weimar",
|
||||||
|
"wissenschaftliche verlagsgesellschaft berlin",
|
||||||
|
"york academic press",
|
||||||
|
"york basic books",
|
||||||
|
"york berghahn books",
|
||||||
|
"york cambridge university",
|
||||||
|
"york columbia university",
|
||||||
|
"york fordham university",
|
||||||
|
"york free press",
|
||||||
|
"york grove press",
|
||||||
|
"york mcgraw hill",
|
||||||
|
"york ny oxford",
|
||||||
|
"york ny routledge",
|
||||||
|
"york oxford university",
|
||||||
|
"york palgrave macmillan",
|
||||||
|
"york penguin books",
|
||||||
|
"york peter lang",
|
||||||
|
"york random house",
|
||||||
|
"york schocken books",
|
||||||
|
"york st martin’s",
|
||||||
|
"york vintage international",
|
||||||
|
"york zed books",
|
||||||
|
"zwolle w.e.j tjeenk",
|
||||||
|
"à la fois",
|
||||||
|
]
|
||||||
|
|
||||||
|
# A list of 'broken' words, not to be used as trigrams
|
||||||
|
stopwords_broken = [
|
||||||
|
"ac tuele bestaansonzekerheid",
|
||||||
|
"bedeutungsexpl ikat ion",
|
||||||
|
"bedeutungsexpl ikat ionen",
|
||||||
|
"bes ted ingen",
|
||||||
|
"biopo liti cal",
|
||||||
|
"bl att mitt",
|
||||||
|
"box offi ce",
|
||||||
|
"ca rib bean",
|
||||||
|
"ca ribb ean",
|
||||||
|
"capital fl ows",
|
||||||
|
"car ib bean",
|
||||||
|
"car ibb ean",
|
||||||
|
"chris tian ity",
|
||||||
|
"chris tiani ty",
|
||||||
|
"christ ian ity",
|
||||||
|
"christ iani ty",
|
||||||
|
"connec ted curr",
|
||||||
|
"con tac ten",
|
||||||
|
"consumptieve bes ted",
|
||||||
|
"cont ro le",
|
||||||
|
"dif ere nt",
|
||||||
|
"dif fer ent",
|
||||||
|
"dif fere nt",
|
||||||
|
"diff ere nt",
|
||||||
|
"diff erent kinds",
|
||||||
|
"direct eff ect",
|
||||||
|
"doe le inden",
|
||||||
|
"empir ische untersuchung",
|
||||||
|
"esp rä ch",
|
||||||
|
"est äti gt",
|
||||||
|
"eu ro pe",
|
||||||
|
"eu ro pean",
|
||||||
|
"eu rop ean",
|
||||||
|
"evo lu tion",
|
||||||
|
"fe mi nist",
|
||||||
|
"fi ction publishers",
|
||||||
|
"fi ction publishing",
|
||||||
|
"fi lm noir",
|
||||||
|
"fi lm theory",
|
||||||
|
"fi losofi sche",
|
||||||
|
"fi nal section",
|
||||||
|
"fi nancial crisis",
|
||||||
|
"fi nancial markets",
|
||||||
|
"fi rm level",
|
||||||
|
"fi rst book",
|
||||||
|
"fi rst century",
|
||||||
|
"fi rst chapter",
|
||||||
|
"fi rst generation",
|
||||||
|
"fi rst muve",
|
||||||
|
"fi rst person",
|
||||||
|
"fi rst principle",
|
||||||
|
"fi rst time",
|
||||||
|
"fi ve yearly",
|
||||||
|
"fl ow cell",
|
||||||
|
"fl ow direction",
|
||||||
|
"fl ow regime",
|
||||||
|
"fl ow velocities",
|
||||||
|
"fo ku ss",
|
||||||
|
"fo lg lic",
|
||||||
|
"fo lk lo",
|
||||||
|
"fo lk poetry",
|
||||||
|
"fo rm ation",
|
||||||
|
"fo rm ationen",
|
||||||
|
"foreign offi ce",
|
||||||
|
"ga ni za",
|
||||||
|
"ge legenhe id",
|
||||||
|
"geopo liti cal",
|
||||||
|
"geopol iti cal",
|
||||||
|
"government offi cials",
|
||||||
|
"gr aph ic",
|
||||||
|
"hu id ige",
|
||||||
|
"ia le zaken",
|
||||||
|
"ial med ia",
|
||||||
|
"ic ul ar",
|
||||||
|
"idsbe le id",
|
||||||
|
"iebe le id",
|
||||||
|
"iesa menlev ing",
|
||||||
|
"ind ica toren",
|
||||||
|
"ind iv iduen",
|
||||||
|
"inf luence attempts",
|
||||||
|
"ingsbe le id",
|
||||||
|
"inso fe rn",
|
||||||
|
"inves ter ingen",
|
||||||
|
"ir ish women",
|
||||||
|
"ite ra tu",
|
||||||
|
"ite ra tur",
|
||||||
|
"jens mar tin",
|
||||||
|
"knowledge pol icy",
|
||||||
|
"le id ig",
|
||||||
|
"le id ing",
|
||||||
|
"leading cit ies",
|
||||||
|
"lex ibe le",
|
||||||
|
"lie ß lic",
|
||||||
|
"lit era ture",
|
||||||
|
"lite ra ture",
|
||||||
|
"lm ä ß",
|
||||||
|
"local fi rms",
|
||||||
|
"maat rege len",
|
||||||
|
"mar tin gurr",
|
||||||
|
"mitt el alt",
|
||||||
|
"mo ve ment",
|
||||||
|
"nat iona le",
|
||||||
|
"ni za tion",
|
||||||
|
"nst verle ning",
|
||||||
|
"offi cial nationality",
|
||||||
|
"onde rzoek ingen",
|
||||||
|
"pa rt ii",
|
||||||
|
"pa rt iii",
|
||||||
|
"par tic ul",
|
||||||
|
"par ticu lar",
|
||||||
|
"perf orm ance",
|
||||||
|
"po liti cal",
|
||||||
|
"pol iti cal",
|
||||||
|
"pos si ble",
|
||||||
|
"pos sib le",
|
||||||
|
"poss ib le",
|
||||||
|
"publ ic archaeology",
|
||||||
|
"pulp fi ction",
|
||||||
|
"repre sen ta",
|
||||||
|
"ro pean union",
|
||||||
|
"rzäh le rs",
|
||||||
|
"samen lev ing",
|
||||||
|
"science fi ction",
|
||||||
|
"scientif ic research",
|
||||||
|
"sen ta tion",
|
||||||
|
"sen ta tions",
|
||||||
|
"sl agva ardigheid",
|
||||||
|
"soc ia le",
|
||||||
|
"soc ial med",
|
||||||
|
"sp ra ch",
|
||||||
|
"sp rach lich",
|
||||||
|
"sp rach liche",
|
||||||
|
"sp rach lichen",
|
||||||
|
"ssp ra ch",
|
||||||
|
"st err ei",
|
||||||
|
"subs id ies",
|
||||||
|
"ted curr iculum",
|
||||||
|
"tele vi sion",
|
||||||
|
"tic ul ar",
|
||||||
|
"tive commons license",
|
||||||
|
"tu ra lis",
|
||||||
|
"tu ra lly",
|
||||||
|
"twenty fi rst",
|
||||||
|
"twenty fi ve",
|
||||||
|
"ty lis tic",
|
||||||
|
"uoo rbee ld",
|
||||||
|
"ve rhoud ing",
|
||||||
|
"ve rö ffe",
|
||||||
|
"veren ig ing",
|
||||||
|
"vi sion dramas",
|
||||||
|
"voorz ien ingen",
|
||||||
|
"werkge legenhe id",
|
||||||
|
"wi jz ig",
|
||||||
|
"zusä tz lich",
|
||||||
|
"ä sthe tik",
|
||||||
|
"ä tz lic",
|
||||||
|
"ä ß ig",
|
||||||
|
"ö st err",
|
||||||
|
"öst err ur",
|
||||||
|
]
|
||||||
|
|
||||||
|
# A list of Dutch stop words, not part of stopwords_nl
|
||||||
|
stopwords_dutch_extra = ["een", "te", "ten"]
|
|
@ -0,0 +1,39 @@
|
||||||
|
from data.connection import get_connection
|
||||||
|
|
||||||
|
|
||||||
|
def create_schema(connection) -> None:
    """Create the ``suggestion`` type and the ``oapen_suggestions`` schema.

    Args:
        connection: Open database connection providing ``cursor()``.

    Raises:
        Whatever the database driver raises when the statements fail; the
        cursor is closed before the error propagates.
    """
    cursor = connection.cursor()
    try:
        # There is deliberately no ";" after CREATE SCHEMA: the CREATE TABLE
        # that follows is a schema element, so the table is created inside
        # ``oapen_suggestions``.
        cursor.execute(
            """
            CREATE TYPE suggestion AS (id uuid, rank int);
            CREATE SCHEMA oapen_suggestions
                CREATE TABLE IF NOT EXISTS suggestions (
                    item_id uuid PRIMARY KEY,
                    name text,
                    suggestions suggestion[]
                );
            """
        )
    finally:
        cursor.close()
|
||||||
|
|
||||||
|
|
||||||
|
def drop_schema(connection) -> None:
    """Drop the ``oapen_suggestions`` schema, ``suggestions`` table and ``suggestion`` type.

    Every statement uses ``IF EXISTS``, so running this against a database
    where the objects are absent is not an error.

    Args:
        connection: Open database connection providing ``cursor()``.

    Raises:
        Whatever the database driver raises when the statements fail; the
        cursor is closed before the error propagates.
    """
    cursor = connection.cursor()
    try:
        cursor.execute(
            """
            DROP SCHEMA IF EXISTS oapen_suggestions CASCADE;
            DROP TABLE IF EXISTS suggestions CASCADE;
            DROP TYPE IF EXISTS suggestion CASCADE;
            """
        )
    finally:
        cursor.close()
|
||||||
|
|
||||||
|
|
||||||
|
# Script body: reset the database objects by dropping and recreating them.
connection = get_connection()
if connection is None:
    # get_connection() prints the connection error and returns None on
    # failure; stop here with a clear message instead of an AttributeError.
    raise SystemExit("Could not connect to the database; schema was not reset.")

try:
    # Drop first so the script can be re-run against an existing database.
    drop_schema(connection)
    create_schema(connection)
finally:
    # Close the connection even if one of the steps above fails.
    connection.close()
|
|
@ -0,0 +1,23 @@
|
||||||
|
import data.oapen as OapenAPI
|
||||||
|
from data.connection import get_connection
|
||||||
|
from data.oapen_db import add_many_suggestions
|
||||||
|
from model.oapen_types import transform_item_data
|
||||||
|
|
||||||
|
|
||||||
|
def mock_suggestion_rows(n=10):
    """Build up to *n* mock suggestion rows from a fixed OAPEN collection.

    Each row has the shape ``(item_id, item_name, [(item_id, rank)])``: every
    item lists itself as its only suggestion, with its position as the rank.

    Args:
        n: Maximum number of rows to build (default 10).

    Returns:
        A list with ``min(n, number of items in the collection)`` rows.
    """
    items = OapenAPI.get_items_from_collection("5f664493-8fee-465a-9c22-7ea8e0595775")

    rows = []
    # zip() stops at whichever is shorter: the requested count or the items.
    for rank, item_id in zip(range(n), items):
        item = transform_item_data(OapenAPI.get_item(item_id))
        rows.append((item_id, item.name, [(item_id, rank)]))

    return rows


# Script body: insert the mock rows into the database.
connection = get_connection()
if connection is None:
    # get_connection() prints the connection error and returns None on
    # failure; stop here with a clear message instead of an AttributeError.
    raise SystemExit("Could not connect to the database; nothing was seeded.")

try:
    # The row count is the function's own default; the connection is not an
    # argument of mock_suggestion_rows.
    rows = mock_suggestion_rows()
    add_many_suggestions(connection, rows)
finally:
    # Close the connection even if seeding fails.
    connection.close()
|
|
@ -8,7 +8,7 @@ skip=[lib/]
|
||||||
profile=black
|
profile=black
|
||||||
|
|
||||||
[flake8]
|
[flake8]
|
||||||
ignore = E203, E266, E501, W503
|
ignore = E203, E266, E501, W503, E501
|
||||||
max-line-length = 88
|
max-line-length = 88
|
||||||
max-complexity = 18
|
max-complexity = 18
|
||||||
select = B,C,E,F,W,T4
|
select = B,C,E,F,W,T4
|
Loading…
Reference in New Issue