Merge branch 'EbookFoundation:main' into main

pull/15/head
Celina Peralta 2022-10-11 14:11:41 -04:00 committed by GitHub
commit 3392f79665
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 612 additions and 56 deletions

View File

@ -1,31 +1,18 @@
repos:
- repo: local
- repo: https://github.com/pycqa/isort
rev: 5.10.1
hooks:
- id: isort
name: isort
stages: [commit]
language: system
entry: cd oapen-engine && pipenv run isort src/
types: [python]
args: [--profile, black, --filter-files, oapen-engine/src]
name: isort (python)
- repo: https://github.com/psf/black
rev: 22.10.0  # NOTE: black removed its "stable" branch; pre-commit needs a pinned release tag
hooks:
- id: black
name: black
stages: [commit]
language: system
entry: cd oapen-engine && pipenv run black src/
types: [python]
language_version: python3.10
args: [oapen-engine/src]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v1.2.3
hooks:
- id: flake8
name: flake8
stages: [commit]
language: system
entry: cd oapen-engine && pipenv run flake8 src/
types: [python]
exclude: setup.py
- id: pytest
name: pytest
stages: [commit]
language: system
entry: cd oapen-engine && pipenv run pytest
types: [python]
args: [oapen-engine/src]

18
oapen-engine/Makefile Normal file
View File

@ -0,0 +1,18 @@
# Convenience targets for the oapen-engine service.
# None of these targets produce a file named after themselves, so mark
# them all phony; otherwise a stray file called e.g. "run" would silently
# disable the target.
.PHONY: setup-env seed_db clean_db run

# Install pipenv and the project dependencies.
# NOTE(review): `pipenv shell` spawns an interactive subshell, which blocks
# make until the user exits it — confirm this is intentional.
setup-env:
ifeq ($(OS),Windows_NT)
	py -m pip install --upgrade pip
else
	python -m pip install --upgrade pip
endif
	pip install pipenv
	pipenv install
	pipenv shell

# Populate the database with mock suggestion rows.
seed_db:
	cd src && pipenv run python tasks/seed.py

# Drop and recreate the suggestions schema.
clean_db:
	cd src && pipenv run python tasks/clean.py

# Run the ngrams engine.
run:
	cd src && pipenv run python main.py

View File

@ -1,5 +1,6 @@
# OAPEN Suggestion Service
## Database Configuration (Local)
## Getting Started
### Database Configuration (Local)
Create a `database.ini` file in `oapen-engine/src` with the following:
```
[postgresql]
@ -8,14 +9,21 @@ database=postgres
user=<username>
password=<your-password>
```
## Running with Pipenv
### Environment setup
```
pipenv install
pipenv shell
cd src
python main.py
cd oapen-engine
make setup-env
```
## Deactivate virtual environment
### Seeding the database
```
make clean_db
make seed_db
```
### Running ngrams
```
make run
```
## How to deactivate virtual environment
While the virtual environment is running, type:
```
deactivate

View File

@ -2,13 +2,11 @@
from configparser import ConfigParser
def config(filename="../database.ini", section="postgresql"):
# create a parser
def config(filename="database.ini", section="postgresql"):
parser = ConfigParser()
# read config file
parser.read(filename)
# get section, default to postgresql
db = {}
if parser.has_section(section):
params = parser.items(section)

View File

@ -1,39 +1,35 @@
#!/usr/bin/python
import psycopg2
from config import config
from data.config import config
def connect():
"""Connect to the PostgreSQL database server"""
def get_connection():
conn = None
try:
# read connection parameters
params = config()
# connect to the PostgreSQL server
print("Connecting to the PostgreSQL database...")
conn = psycopg2.connect(**params)
conn.autocommit = True
# create a cursor
cur = conn.cursor()
# execute a statement
print("PostgreSQL database version:")
cur.execute("SELECT version()")
# display the PostgreSQL database server version
db_version = cur.fetchone()
print(db_version)
# close the communication with the PostgreSQL
cur.close()
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
print("Database connection closed.")
return conn
return conn
if __name__ == "__main__":
connect()
def close_connection(conn):
    """Close a psycopg2 connection if one was actually opened.

    Safe to call with ``None`` (e.g. when ``get_connection`` failed and
    returned nothing).
    """
    if conn is None:
        return
    conn.close()
    print("Database connection closed.")

View File

@ -0,0 +1,50 @@
import logging
from logging import Logger

from model.oapen_types import OapenSuggestion
def table_exists(connection, table):
    """Return True if `table` is present in the oapen_suggestions catalog.

    :param connection: open psycopg2 connection
    :param table: table name to look for
    :return: boolean from the EXISTS query
    """
    # Bug fix: `connection.cursor` was referenced without calling it, so
    # `cursor` was a bound method and every subsequent call failed.
    cursor = connection.cursor()
    # NOTE(review): `oapen_suggestions.tables` looks like it should be
    # `information_schema.tables` (with a table_schema filter) — confirm.
    cursor.execute(
        "select exists(select * from oapen_suggestions.tables where table_name=%s)",
        # Bug fix: psycopg2 needs a sequence of parameters; `(table)` is just
        # a parenthesized string, not a 1-tuple.
        (table,),
    )
    res = cursor.fetchone()[0]
    cursor.close()
    return res
def add_suggestion(connection, suggestion: "OapenSuggestion") -> None:
    """Insert a single row into oapen_suggestions.suggestions.

    :param connection: open psycopg2 connection (autocommit assumed)
    :param suggestion: (item_id, name, suggestions) tuple matching the table
    """
    cursor = connection.cursor()
    query = """
        INSERT INTO oapen_suggestions.suggestions VALUES (%s, %s, %s)
    """
    try:
        cursor.execute(query, suggestion)
    except Exception:
        # Bug fix: the original called Logger.exception(ex) — an unbound
        # method on the logging.Logger class — which itself raised inside
        # the handler. Use a real module-level logger instead.
        logging.getLogger(__name__).exception("Failed to insert suggestion")
    finally:
        cursor.close()
def add_many_suggestions(connection, suggestions) -> None:
    """Bulk-insert rows into oapen_suggestions.suggestions.

    Rows are client-side expanded with cursor.mogrify into a single
    multi-row INSERT statement.

    :param connection: open psycopg2 connection (autocommit assumed)
    :param suggestions: iterable of (item_id, name, suggestions) tuples
    """
    # Robustness fix: with no rows the original built "INSERT ... VALUES "
    # (no tuple list), which is invalid SQL.
    if not suggestions:
        return
    cursor = connection.cursor()
    args_str = ",".join(
        cursor.mogrify("(%s,%s,%s::suggestion[])", x).decode("utf-8")
        for x in suggestions
    )
    query = f"""
        INSERT INTO oapen_suggestions.suggestions VALUES {args_str}
    """
    try:
        cursor.execute(query)
    except Exception:
        # Bug fix: Logger.exception(ex) called an unbound method on the
        # logging.Logger class; use a module logger.
        logging.getLogger(__name__).exception("Failed to bulk-insert suggestions")
    finally:
        cursor.close()

View File

@ -1,14 +1,13 @@
import string
from typing import List
import nltk
import pandas as pd
import data.oapen as OapenAPI
import lib.stopwords as oapen_stopwords
import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from .oapen_types import OapenItem, transform_item_data
nltk.download("stopwords")

View File

@ -20,6 +20,9 @@ class OapenItem:
return ""
# NOTE(review): this evaluates to the runtime tuple (str, int), not a typing
# alias — presumably Tuple[str, int] (item id, rank) was intended. It is only
# used in annotations today, but confirm and switch to typing.Tuple[str, int].
OapenSuggestion = (str, int)
def transform_item_data(data) -> OapenItem:
uuid = data["uuid"]
name = data["name"]

View File

@ -0,0 +1,435 @@
# Generic license/metadata n-gram fragments (Creative Commons boilerplate,
# ISBN/DOI strings, ...) that should never count as meaningful trigrams.
stopwords_filter = [
    "4.0 deed.de",
    "commons attribution",
    "commons lizenz",
    "commons namensnennung",
    "creative commons",
    "creativecommons",
    "creativecommons.org",
    "doi 10",
    "doi.org",
    "hrsg",
    "isbn 13 978",
    "isbn 3",
    "isbn 978",
    "noderivatives",
    "noderivs",
    "noncommercial",
    "orcid.org",
    "springer cham",
    "university press",
]

# A list of stop words, connected to either open licenses or publishers
stopwords_publisher = [
    "2010 iwa publishing",
    "2015 http www",
    "4.0 deed.de veröffentli",
    "4.0 international license",
    "4.0 international lizenz",
    "academic studies press",
    "access book chapter",
    "access chapter distributed",
    "adapted material derived",
    "akademie verlag berlin",
    "albany suny press",
    "amer math soc",
    "american economic review",
    "amsterdam amsterdam university",
    "amsterdam john benjamins",
    "amsterdam philadelphia john",
    "angeben ob änderungen",
    "aosis cape town",
    "attribution 4.0 international",
    "attribution licence cc",
    "attribution noncommercial noderivatives",
    "attribution noncommercial noderivs",
    "auckland auckland university",
    "baden baden nomos",
    "baltimore johns hopkins",
    "baltimore md johns",
    "basingstoke palgrave macmillan",
    "basingstoke uk palgrave",
    "benjamins publishing company",
    "berlin akademie verlag",
    "berlin heidelberg springer",
    "berlin language science",
    "berlin springer verlag",
    "bielefeld transcript verlag",
    "bloomington indiana university",
    "boom juridische uitgevers",
    "bridge cambridge university",
    "brill nv leiden",
    "british medical journal",
    "brunswick nj rutgers",
    "böhlau verlag ges.m.b.h",
    "böhlau verlag gmbh",
    "ca csli publications",
    "ca stanford university",
    "cambridge cambridge university",
    "cambridge eng cambridge",
    "cambridge harvard university",
    "cambridge ma harvard",
    "cambridge ma london",
    "cambridge mass harvard",
    "cambridge polity press",
    "cambridge uk cambridge",
    "clarendon press oxford",
    "co.kg wien köln",
    "commons attribution noncommercial",
    "commons lizenz sofern",
    "ct yale university",
    "delhi oxford university",
    "doi 63 9789004",
    "doi https doi",
    "drittmaterial unterliegen ebenfalls",
    "durham duke university",
    "durham n.c duke",
    "durham nc duke",
    "earth syst sci",
    "edinburgh edinburgh university",
    "eds tacas 2020",
    "evanston il northwestern",
    "evanston ill northwestern",
    "format erlaubt sofern",
    "frankfurt a.m campus",
    "frankfurt a.m fischer",
    "frankfurt a.m suhr",
    "frankfurt a.m suhrkamp",
    "frankfurt main suhrkamp",
    "franz steiner verlag",
    "gent academia press",
    "gesetzlichen vorschriften erlaubt",
    "gmbh co.kg wien",
    "grand rapids mi",
    "gravenhage sdu uitgeverij",
    "groningen wolters noordhoff",
    "göttingen vandenhoeck ruprecht",
    "h.d tjeenk willink",
    "haag boom juridische",
    "haag boom lemma",
    "haag sdu uitgeverij",
    "haag sdu uitgevers",
    "harvard business review",
    "haven ct yale",
    "haven yale university",
    "hong princeton n.j",
    "html letzter zugriff",
    "html zuletzt abgerufen",
    "html zuletzt geprüft",
    "http csli publications.stanford.edu",
    "http www.youtube.com watch",
    "https en.wikipedia.org wiki",
    "https www.youtube.com watch",
    "intentionally left blank",
    "international license http",
    "international lizenz http",
    "ithaca cornell university",
    "ithaca ny cornell",
    "iwa publishing london",
    "jeweiligen rechteinhabers einzuholen",
    "john benjamins doi",
    "john benjamins publishing",
    "john wiley sons",
    "johns hopkins university",
    "kapitel enthaltenen bilder",
    "kluwer academic publishers",
    "kluwer law international",
    "koninklijke brill nv",
    "köln weimar wien",
    "language science press",
    "leiden boston brill",
    "leiden e.j brill",
    "leiden kitlv press",
    "leiden stenfert kroese",
    "london duke university",
    "london i.b tauris",
    "london john murray",
    "london oxford university",
    "london penguin books",
    "london pluto press",
    "london routledge kegan",
    "london routledge pp",
    "london sage publications",
    "london ubiquity press",
    "london ucl press",
    "london zed books",
    "ma harvard university",
    "main peter lang",
    "manchester manchester university",
    "mass harvard university",
    "medizinisch wissenschaftliche verlagsgesellschaft",
    "mohr paul siebeck",
    "mohr siebeck tübingen",
    "münchen wilhelm fink",
    "n.j princeton university",
    "namensnennung 4.0 international",
    "national lizenz http",
    "nc 4.0 license",
    "nc duke university",
    "nc sa 4.0",
    "nj prentice hall",
    "nj prince ton",
    "nj princeton university",
    "nj rutgers university",
    "noderivatives 4.0 license",
    "noncommercial noderivatives 4.0",
    "north carolina press",
    "notre dame press",
    "nutzung vervielfältigung bearbeitung",
    "ny cornell university",
    "ny orbis books",
    "ob änderungen vorgenommen",
    "obtain permission directly",
    "online url http",
    "opladen leske budrich",
    "opladen westdeutscher verlag",
    "opladen westdt verlag",
    "otto cramwinckel uitgever",
    "oxford clarendon press",
    "oxford oxford u.p",
    "oxford oxford university",
    "p.i.e peter lang",
    "page intentionally left",
    "paris presses universitaires",
    "pdf letzter zugriff",
    "pdf zuletzt aufgerufen",
    "pdf zuletzt eingesehen",
    "pdf zuletzt geprüft",
    "peter lang frankfurt",
    "phd diss university",
    "phd thesis university",
    "philadelphia john benjamins",
    "press 2019 pp",
    "press cambridge ma",
    "press cambridge mass",
    "press cambridge pp",
    "press doi https",
    "press oxford pp",
    "press washington d.c",
    "princeton n.j princeton",
    "princeton nj princeton",
    "princeton princeton university",
    "publications http csli",
    "quelle eigene berechnungen",
    "quelle eigene darstellung",
    "quelle ordnungsgemäß nennen",
    "samsom uitgeverij alphen",
    "science press doi",
    "share adapted material",
    "sharing adaptation distribution",
    "siehe dazu u.a",
    "siehe reg nr",
    "sites default files",
    "sonstiges drittmaterial unterliegen",
    "springer berlin heidelberg",
    "st martins press",
    "stanford ca csli",
    "stanford ca stanford",
    "stanford calif stanford",
    "stanford stanford university",
    "sydney sydney university",
    "toronto barbara budrich",
    "tübingen mohr siebeck",
    "ubiquity press doi",
    "uitgeverij bert bakker",
    "uk cambridge university",
    "uk palgrave macmillan",
    "unpublished phd dissertation",
    "url http bit.ly",
    "verlag barbara budrich",
    "verlag gmbh co.kg",
    "vervielfältigung bearbeitung verbreitung",
    "vis à vis",
    "westminster press doi",
    "wien köln weimar",
    "wissenschaftliche verlagsgesellschaft berlin",
    "york academic press",
    "york basic books",
    "york berghahn books",
    "york cambridge university",
    "york columbia university",
    "york fordham university",
    "york free press",
    "york grove press",
    "york mcgraw hill",
    "york ny oxford",
    "york ny routledge",
    "york oxford university",
    "york palgrave macmillan",
    "york penguin books",
    "york peter lang",
    "york random house",
    "york schocken books",
    "york st martins",
    "york vintage international",
    "york zed books",
    "zwolle w.e.j tjeenk",
    "à la fois",
]

# A list of 'broken' words, not to be used as trigrams
# (these look like hyphenation/ligature artifacts from PDF text extraction,
# e.g. "fi rst" for "first" — presumably; confirm against the extractor).
stopwords_broken = [
    "ac tuele bestaansonzekerheid",
    "bedeutungsexpl ikat ion",
    "bedeutungsexpl ikat ionen",
    "bes ted ingen",
    "biopo liti cal",
    "bl att mitt",
    "box offi ce",
    "ca rib bean",
    "ca ribb ean",
    "capital fl ows",
    "car ib bean",
    "car ibb ean",
    "chris tian ity",
    "chris tiani ty",
    "christ ian ity",
    "christ iani ty",
    "connec ted curr",
    "con tac ten",
    "consumptieve bes ted",
    "cont ro le",
    "dif ere nt",
    "dif fer ent",
    "dif fere nt",
    "diff ere nt",
    "diff erent kinds",
    "direct eff ect",
    "doe le inden",
    "empir ische untersuchung",
    "esp rä ch",
    "est äti gt",
    "eu ro pe",
    "eu ro pean",
    "eu rop ean",
    "evo lu tion",
    "fe mi nist",
    "fi ction publishers",
    "fi ction publishing",
    "fi lm noir",
    "fi lm theory",
    "fi losofi sche",
    "fi nal section",
    "fi nancial crisis",
    "fi nancial markets",
    "fi rm level",
    "fi rst book",
    "fi rst century",
    "fi rst chapter",
    "fi rst generation",
    "fi rst muve",
    "fi rst person",
    "fi rst principle",
    "fi rst time",
    "fi ve yearly",
    "fl ow cell",
    "fl ow direction",
    "fl ow regime",
    "fl ow velocities",
    "fo ku ss",
    "fo lg lic",
    "fo lk lo",
    "fo lk poetry",
    "fo rm ation",
    "fo rm ationen",
    "foreign offi ce",
    "ga ni za",
    "ge legenhe id",
    "geopo liti cal",
    "geopol iti cal",
    "government offi cials",
    "gr aph ic",
    "hu id ige",
    "ia le zaken",
    "ial med ia",
    "ic ul ar",
    "idsbe le id",
    "iebe le id",
    "iesa menlev ing",
    "ind ica toren",
    "ind iv iduen",
    "inf luence attempts",
    "ingsbe le id",
    "inso fe rn",
    "inves ter ingen",
    "ir ish women",
    "ite ra tu",
    "ite ra tur",
    "jens mar tin",
    "knowledge pol icy",
    "le id ig",
    "le id ing",
    "leading cit ies",
    "lex ibe le",
    "lie ß lic",
    "lit era ture",
    "lite ra ture",
    "lm ä ß",
    "local fi rms",
    "maat rege len",
    "mar tin gurr",
    "mitt el alt",
    "mo ve ment",
    "nat iona le",
    "ni za tion",
    "nst verle ning",
    "offi cial nationality",
    "onde rzoek ingen",
    "pa rt ii",
    "pa rt iii",
    "par tic ul",
    "par ticu lar",
    "perf orm ance",
    "po liti cal",
    "pol iti cal",
    "pos si ble",
    "pos sib le",
    "poss ib le",
    "publ ic archaeology",
    "pulp fi ction",
    "repre sen ta",
    "ro pean union",
    "rzäh le rs",
    "samen lev ing",
    "science fi ction",
    "scientif ic research",
    "sen ta tion",
    "sen ta tions",
    "sl agva ardigheid",
    "soc ia le",
    "soc ial med",
    "sp ra ch",
    "sp rach lich",
    "sp rach liche",
    "sp rach lichen",
    "ssp ra ch",
    "st err ei",
    "subs id ies",
    "ted curr iculum",
    "tele vi sion",
    "tic ul ar",
    "tive commons license",
    "tu ra lis",
    "tu ra lly",
    "twenty fi rst",
    "twenty fi ve",
    "ty lis tic",
    "uoo rbee ld",
    "ve rhoud ing",
    "ve rö ffe",
    "veren ig ing",
    "vi sion dramas",
    "voorz ien ingen",
    "werkge legenhe id",
    "wi jz ig",
    "zusä tz lich",
    "ä sthe tik",
    "ä tz lic",
    "ä ß ig",
    "ö st err",
    "öst err ur",
]

# A list of Dutch stop words, not part of stopwords_nl
stopwords_dutch_extra = ["een", "te", "ten"]

View File

@ -0,0 +1,39 @@
from data.connection import get_connection
def create_schema(connection) -> None:
    """Create the `suggestion` composite type and the oapen_suggestions
    schema with its suggestions table.

    Relies on the connection being in autocommit mode (set by
    get_connection), so no explicit commit is issued here.

    :param connection: open psycopg2 connection
    """
    cursor = connection.cursor()
    try:
        # The nested CREATE TABLE is PostgreSQL's schema-element syntax:
        # the table is created inside the new schema.
        cursor.execute(
            """
            CREATE TYPE suggestion AS (id uuid, rank int);
            CREATE SCHEMA oapen_suggestions
                CREATE TABLE IF NOT EXISTS suggestions (
                    item_id uuid PRIMARY KEY,
                    name text,
                    suggestions suggestion[]
                );
            """
        )
    finally:
        # Close the cursor even if the DDL fails, so it never leaks.
        cursor.close()
def drop_schema(connection) -> None:
    """Drop the oapen_suggestions schema, table and composite type.

    Idempotent: uses IF EXISTS / CASCADE so it succeeds whether or not the
    objects exist.

    :param connection: open psycopg2 connection (autocommit assumed)
    """
    cursor = connection.cursor()
    try:
        cursor.execute(
            """
            DROP SCHEMA IF EXISTS oapen_suggestions CASCADE;
            DROP TABLE IF EXISTS suggestions CASCADE;
            DROP TYPE IF EXISTS suggestion CASCADE;
            """
        )
    finally:
        # Close the cursor even if the DDL fails, so it never leaks.
        cursor.close()
# Rebuild the schema from scratch when run as a script: drop whatever
# exists, then recreate it. The __main__ guard lets other modules import
# create_schema/drop_schema without nuking the live schema as a side effect.
if __name__ == "__main__":
    connection = get_connection()
    drop_schema(connection)
    create_schema(connection)
    connection.close()

View File

@ -0,0 +1,23 @@
import data.oapen as OapenAPI
from data.connection import get_connection
from data.oapen_db import add_many_suggestions
from model.oapen_types import transform_item_data
def mock_suggestion_rows(n=10):
    """Build up to *n* mock suggestion rows from a fixed OAPEN collection.

    Each row is (item_id, item_name, [(item_id, rank)]), matching the
    oapen_suggestions.suggestions table shape.

    :param n: maximum number of rows to build (default 10)
    :return: list of row tuples
    """
    # Tolerate the legacy call site that passed the DB connection instead of
    # a count; fall back to the old hard-coded limit in that case.
    if not isinstance(n, int):
        n = 10
    items = OapenAPI.get_items_from_collection("5f664493-8fee-465a-9c22-7ea8e0595775")
    rows = []
    # Bug fix: the loop bound was hard-coded to min(10, ...), silently
    # ignoring the n parameter.
    for i in range(min(n, len(items))):
        item = transform_item_data(OapenAPI.get_item(items[i]))
        rows.append((items[i], item.name, [(items[i], i)]))
    return rows
# Seed the database with mock rows when run as a script. The __main__ guard
# keeps imports of this module side-effect free.
if __name__ == "__main__":
    connection = get_connection()
    # Bug fix: the connection was being passed as mock_suggestion_rows' `n`
    # (row count) argument; the function takes no connection at all.
    rows = mock_suggestion_rows()
    add_many_suggestions(connection, rows)
    connection.close()

View File

@ -8,7 +8,7 @@ skip=[lib/]
profile=black
[flake8]
ignore = E203, E266, E501, W503
ignore = E203, E266, E501, W503
max-line-length = 88
max-complexity = 18
select = B,C,E,F,W,T4