Merge branch 'main' into celinanperalta/OAP-60

celinanperalta/OAP-60
Celina Peralta 2023-04-03 17:03:28 -04:00 committed by GitHub
commit dab7812da4
37 changed files with 377 additions and 187 deletions

@@ -1,27 +0,0 @@
-name: OAPEN Engine
-on: [push]
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    env:
-      working-directory: ./oapen-engine
-    steps:
-      - uses: actions/checkout@v3
-      - name: Setup Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.10'
-      - name: Install dependencies with pipenv
-        working-directory: ${{env.working-directory}}
-        run: |
-          pip install pipenv
-          pipenv install --deploy --dev
-          pipenv run isort --profile black src/
-          pipenv run black --check src/ --exclude="lib/*"
-          pipenv run flake8 src/ --ignore="lib/*, W, E203, E266, E501, W503, F403, F401"

@@ -1,4 +1,4 @@
-name: Web lint checker
+name: Build and test web
 on: push
 jobs:
   test:

@@ -1,4 +1,4 @@
-name: Test Containers
+name: Build and test containers
 on: push

@@ -4,7 +4,7 @@ The OAPEN Suggestion Engine will suggest ebooks based on other books with similar
 ## Running server

-You can run all the servers together with `./all-dev.sh` -- after installing dependencies with `./setup.sh`
+You can run all the servers together with `./all-dev.sh` -- after installing dependencies with `. ./setup.sh`

 ## Monorepo components
@@ -17,6 +17,8 @@ Our suggestion service is centered around the trigram semantic inferencing algorithm
 You can find the code for the mining engine in `oapen-engine/`.

+Information about running the mining engine is in [`oapen-engine/README.md`](oapen-engine/README.md).
+
 **Base dependencies**:
 * Python v3.10
 * PIP package manager
@@ -44,6 +46,8 @@ This API server returns a list of recommended books from the database.
 You can find the code for the API engine in `api/`.

+Configuration info for the API engine is in [`api/README.md`](api/README.md).
+
 **Base dependencies**:
 * NodeJS 14.x+
 * NPM package manager
@@ -64,6 +68,8 @@ This is a web-app demo that can be used to query the API engine and see suggested
 You can find the code for the web demo in `web/`.

+Configuration info for the web demo is in [`web/README.md`](web/README.md).
+
 **Base dependencies**:
 * NodeJS 14.x+
 * NPM package manager

all-dev.sh Normal file → Executable file

@@ -8,11 +8,10 @@ async function querySuggestions(handle, threshold = 0) {
   await validate.checkHandle(handle);

   const query = new PQ({
-    text: `SELECT s.*
-           FROM (SELECT handle, unnest(suggestions::oapen_suggestions.suggestion[]) AS suggestion
-                 FROM oapen_suggestions.suggestions) s
+    text: `SELECT suggestion AS handle, score
+           FROM oapen_suggestions.suggestions
            WHERE handle = $1
-           AND (s.suggestion).similarity >= $2`,
+           AND score >= $2`,
     values: [handle, threshold],
   });
@@ -23,9 +22,11 @@ async function querySuggestions(handle, threshold = 0) {
   if (result?.["error"])
     return result;

+  console.log(result);
+
   const data = {
     "handle": handle,
-    "suggestions": result.map((e) => {return e["suggestion"];})
+    "suggestions": result
   };

   return data;

@@ -3,7 +3,7 @@ services:
   oapen-engine :
     build: ./oapen-engine/
     environment:
-      - RUN_CLEAN=1
+      - RUN_CLEAN=0
       - COLLECTION_IMPORT_LIMIT=0 # Set to 0 for full harvest
       - REFRESH_PERIOD=86400 # daily
       - HARVEST_PERIOD=604800 # weekly

@@ -48,4 +48,4 @@ RUN chmod -R +x scripts
 USER appuser

 # Run the application
-ENTRYPOINT ["./scripts/run.sh"]
+ENTRYPOINT ["./scripts/test-and-run.sh"]

oapen-engine/Makefile Normal file

@@ -0,0 +1,42 @@
+PYTHONEX ?= "python"
+PYTHONPATH = "$(CURDIR)/src"
+PYTHON = PYTHONPATH="$(PYTHONPATH)" $(PYTHONEX)
+
+setup-env:
+ifeq ($(OS),Windows_NT)
+	py -m pip install --upgrade pip
+else
+	$(PYTHON) -m pip install --upgrade pip
+endif
+	$(PYTHON) -m pip install pipenv
+	$(PYTHON) -m pipenv install --skip-lock
+	$(PYTHON) -m pipenv shell
+
+seed_db:
+	cd src && $(PYTHON) -m pipenv run python tasks/seed.py
+
+clean_db:
+	cd src && $(PYTHON) -m pipenv run python tasks/clean.py
+
+clean_and_seed:
+	$(MAKE) clean_db
+	$(MAKE) seed_db
+
+generate_suggestions:
+	cd src && $(PYTHON) -m pipenv run python tasks/generate_suggestions.py
+
+run:
+	$(MAKE) clean_and_seed
+	$(MAKE) generate_suggestions
+
+run-tests:
+	cd src && $(PYTHON) -m pipenv run pytest
+
+refresh-items:
+	cd src && $(PYTHON) -m pipenv run python tasks/refresh_items.py
+
+run-daemon:
+	cd src && $(PYTHON) -m pipenv run python tasks/daemon.py
+
+run-unit-tests:
+	cd src && $(PYTHON) -m pipenv run python test/data/run_tests.py

@@ -10,6 +10,10 @@ psycopg2 = "2.9.3"
 pandas = "*"
 scikit-learn = "*"
 lxml = "*"
+schedule = "*"
+charset_normalizer = "*"
+idna = "*"
+certifi = "*"

 [dev-packages]
 pytest = "*"

oapen-engine/scripts/clean.sh Normal file → Executable file

oapen-engine/scripts/refresh.sh Normal file → Executable file

@@ -1,3 +0,0 @@
-#!/bin/sh
-
-python src/tasks/daemon.py

@@ -0,0 +1,9 @@
+#!/bin/sh
+
+# exit when any command fails
+set -e
+
+echo "Running tests..." && \
+python src/test/data/run_tests.py && \
+echo "Running app" && \
+python src/tasks/daemon.py


@@ -20,7 +20,6 @@ def get_connection():
     cur.close()

-    register_composite("oapen_suggestions.suggestion", conn, globally=True)
     register_composite("oapen_suggestions.ngram", conn, globally=True)

     return conn

@@ -32,10 +32,7 @@ class OapenDB:
         suggestions = self.deduplicate(suggestions)
         cursor = self.connection.cursor()
         args = ",".join(
-            cursor.mogrify("(%s,%s,%s::oapen_suggestions.suggestion[])", x).decode(
-                "utf-8"
-            )
-            for x in suggestions
+            cursor.mogrify("(%s,%s,%s,%s)", x).decode("utf-8") for x in suggestions
         )
         cursor.close()
         return args
@@ -81,14 +78,13 @@ class OapenDB:
         cursor = self.connection.cursor()
         query = """
             INSERT INTO oapen_suggestions.suggestions (handle, name, suggestions)
-            VALUES (%s, %s, %s::oapen_suggestions.suggestion[])
-            ON CONFLICT (handle)
-            DO
-                UPDATE SET suggestions = excluded.suggestions
+            VALUES (%s, %s, %s, %s)
             """
         try:
-            cursor.execute(query, (suggestion[0], suggestion[1], suggestion[2]))
+            cursor.execute(
+                query, (suggestion[0], suggestion[1], suggestion[2], suggestion[3])
+            )
         except (Exception, psycopg2.Error) as error:
             logger.error(error)
         finally:
@@ -98,11 +94,8 @@ class OapenDB:
         cursor = self.connection.cursor()
         args = self.mogrify_suggestions(suggestions)
         query = f"""
-            INSERT INTO oapen_suggestions.suggestions (handle, name, suggestions)
+            INSERT INTO oapen_suggestions.suggestions (handle, name, suggestion, score)
             VALUES {args}
-            ON CONFLICT (handle)
-            DO
-                UPDATE SET suggestions = excluded.suggestions
             """
         try:
@@ -147,13 +140,17 @@ class OapenDB:
         finally:
             cursor.close()

-    def get_all_ngrams(self, ngram_limit=None) -> List[NgramRow]:
+    # get_empty = True -> Include rows with no ngrams in result
+    def get_all_ngrams(self, get_empty=True) -> List[NgramRow]:
         cursor = self.connection.cursor()
         query = """
             SELECT handle, CAST (ngrams AS oapen_suggestions.ngram[]), created_at, updated_at
             FROM oapen_suggestions.ngrams
             """
-        ret = None
+        if not get_empty:
+            query += """
+                WHERE ngrams != \'{}\'
+            """
         try:
             cursor.execute(query)
             records = cursor.fetchall()
@@ -168,8 +165,7 @@ class OapenDB:
     def get_all_suggestions(self) -> List[SuggestionRow]:
         cursor = self.connection.cursor()
         query = """
-            SELECT handle, name, CAST (suggestions AS oapen_suggestions.suggestion[]), created_at, updated_at
-            FROM oapen_suggestions.suggestions
+            SELECT * FROM oapen_suggestions.suggestions
             """
         ret = None
         try:
@@ -184,6 +180,25 @@ class OapenDB:
             cursor.close()
         return ret

+    def get_suggestions_for_item(self, handle) -> List[SuggestionRow]:
+        cursor = self.connection.cursor()
+        query = """
+            SELECT * FROM oapen_suggestions.suggestions
+            WHERE handle = \'%s\'
+            """
+        ret = None
+        try:
+            cursor.execute(query, handle)
+            records = cursor.fetchall()
+            ret = records
+        except (Exception, psycopg2.Error) as error:
+            logger.error(error)
+        finally:
+            cursor.close()
+        return ret
+
     def count_table(self, table_name) -> int or None:
         cursor = self.connection.cursor()
         query = "SELECT COUNT(*) FROM %s"
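For context on the new suggestion inserts above: rows are now flat (handle, name, suggestion, score) tuples joined into a single multi-row VALUES statement via psycopg2's mogrify. A minimal standalone sketch of that pattern, with a placeholder DSN and made-up handles and titles:

    import psycopg2

    conn = psycopg2.connect("dbname=oapen user=postgres")  # placeholder DSN, not from this repo
    cursor = conn.cursor()

    # each tuple is one suggestion row: (handle, name, suggested handle, score)
    rows = [
        ("handle-a", "Title A", "handle-b", 3),
        ("handle-a", "Title A", "handle-c", 2),
    ]
    args = ",".join(cursor.mogrify("(%s,%s,%s,%s)", r).decode("utf-8") for r in rows)
    cursor.execute(
        "INSERT INTO oapen_suggestions.suggestions (handle, name, suggestion, score) "
        + f"VALUES {args}"
    )
    conn.commit()
    cursor.close()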

@@ -2,8 +2,6 @@ import logging

 logger = logging.getLogger(__name__)

-file_handler = logging.FileHandler("debug.log")
-file_handler.setLevel(logging.DEBUG)
-
 stream_handler = logging.StreamHandler()
 stream_handler.setLevel(logging.INFO)
@@ -11,5 +9,5 @@ logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s %(levelname)s %(threadName)s - %(funcName)s: %(message)s",
     datefmt="%Y-%m-%d %H:%M:%S",
-    handlers=[file_handler, stream_handler],
+    handlers=[stream_handler],
 )

@@ -1,37 +1,19 @@
 import string
 from typing import List

-import pandas as pd  # pylint: disable=import-error
-from nltk import word_tokenize  # pylint: disable=import-error
-from nltk.corpus import stopwords  # pylint: disable=import-error
+import nltk
+from nltk import word_tokenize
+from .stopwords_processor import STOPWORDS
+import pandas as pd

-from .oapen_types import (  # pylint: disable=relative-beyond-top-level
+nltk.download('punkt')
+
+from .oapen_types import (
     NgramDict,
     NgramRowWithoutDate,
     OapenItem,
 )

-stopword_paths = [
-    "src/model/stopwords_broken.txt",
-    "src/model/stopwords_dutch.txt",
-    "src/model/stopwords_filter.txt",
-    "src/model/stopwords_publisher.txt",
-]
-
-stopwords_list = []
-for p in stopword_paths:
-    with open(p, "r") as f:
-        stopwords_list += [line.rstrip() for line in f]
-
-STOPWORDS = (
-    stopwords.words("english")
-    + stopwords.words("german")
-    + stopwords.words("dutch")
-    + stopwords_list
-)

 def process_text(text):
     l_text = text.lower()
     p_text = "".join([c for c in l_text if c not in string.punctuation])

@@ -25,9 +25,8 @@ class OapenItem:
         return hash(self.handle, "handle")

-Suggestion = Tuple[str, float]
-SuggestionRowWithoutDate = Tuple[str, str, List[Suggestion]]
-SuggestionRowWithDate = Tuple[str, str, List[Suggestion], datetime, datetime]
+SuggestionRowWithoutDate = Tuple[str, str, str, int]
+SuggestionRowWithDate = Tuple[str, str, str, int, datetime, datetime]
 SuggestionRow = Union[SuggestionRowWithDate, SuggestionRowWithoutDate]

 Ngram = Tuple[str, int]
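With this change a suggestion row is one flat (handle, name, suggestion, score) tuple per suggested book rather than a nested list of suggestions per handle. A hypothetical illustration of the new shape (the handles and title below are made up; the score is the shared-ngram count used elsewhere in this commit):

    from typing import Tuple

    SuggestionRowWithoutDate = Tuple[str, str, str, int]

    row: SuggestionRowWithoutDate = (
        "20.500.12657/00001",  # handle of the item being suggested for (made up)
        "Some Book Title",     # its title (made up)
        "20.500.12657/00002",  # handle of the suggested item (made up)
        7,                     # score: number of shared top-k ngrams
    )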

@@ -0,0 +1,44 @@
+import nltk
+from nltk.corpus import stopwords
+from functools import reduce
+import os
+
+# This is run as a precaution in case of the error "NLTK stop words not found",
+# which makes sure to download the stop words after installing nltk
+nltk.download("stopwords")
+
+# add additional custom stopwords to ./custom_lists/ folder and update the reference here
+
+# print working directory
+print("Working directory: " + os.getcwd())
+current_dir = os.path.realpath(os.path.dirname(__file__))
+print("Local script directory: " + current_dir)
+
+custom_lists_folder = current_dir + "/stopwords/"
+custom_stopwords_in_use = [
+    "broken",
+    "dutch",
+    "filter",
+    "publisher",
+]
+
+# For reference on available languages, please reference https://pypi.org/project/stop-words/
+enabled_languages = [
+    "english",
+    "german",
+    "dutch"
+]
+
+# the combined stopwords of all enabled languages
+nltk_stopwords = []
+for language in enabled_languages:
+    nltk_stopwords += stopwords.words(language)
+
+# get the custom lists
+custom_stopwords = []
+for custom_list in custom_stopwords_in_use:
+    with open(custom_lists_folder + custom_list + ".txt", "r") as file:  # specify folder name
+        custom_stopwords += [line.rstrip() for line in file]
+
+# add languages and custom stopwords for final stopwords var
+STOPWORDS = (nltk_stopwords + custom_stopwords)
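For context, a combined STOPWORDS list like the one assembled above is typically applied when tokenizing item text before ngram generation. A minimal sketch under that assumption (the filter_stopwords helper and the tiny stopword set are made up for illustration, not part of this commit):

    STOPWORDS = {"the", "a", "of"}  # stand-in for the combined NLTK + custom lists

    def filter_stopwords(text: str):
        # hypothetical helper: lowercase, split, drop stopwords
        return [t for t in text.lower().split() if t not in STOPWORDS]

    print(filter_stopwords("The Quick Red Fox"))  # ['quick', 'red', 'fox']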

@@ -14,7 +14,6 @@ def create_schema(connection) -> None:
         """
         CREATE SCHEMA oapen_suggestions;

-        CREATE TYPE oapen_suggestions.suggestion AS (handle text, similarity float);
         CREATE TYPE oapen_suggestions.ngram AS (ngram text, count int);

         CREATE OR REPLACE FUNCTION update_modtime()
@@ -26,11 +25,13 @@ def create_schema(connection) -> None:
         $$ language 'plpgsql';

         CREATE TABLE IF NOT EXISTS oapen_suggestions.suggestions (
-            handle text PRIMARY KEY,
+            handle text,
             name text,
-            suggestions oapen_suggestions.suggestion[],
+            suggestion text,
+            score int,
             created_at timestamp default current_timestamp,
-            updated_at timestamp default current_timestamp
+            updated_at timestamp default current_timestamp,
+            PRIMARY KEY (handle, suggestion)
         );

         CREATE TABLE IF NOT EXISTS oapen_suggestions.ngrams (
@@ -49,6 +50,12 @@ def create_schema(connection) -> None:
         CREATE TRIGGER update_suggestion_modtime BEFORE UPDATE ON oapen_suggestions.suggestions FOR EACH ROW EXECUTE PROCEDURE update_modtime();
         CREATE TRIGGER update_ngrams_modtime BEFORE UPDATE ON oapen_suggestions.ngrams FOR EACH ROW EXECUTE PROCEDURE update_modtime();
         CREATE TRIGGER update_endpoint_modtime BEFORE UPDATE ON oapen_suggestions.endpoints FOR EACH ROW EXECUTE PROCEDURE update_modtime();
+
+        CREATE INDEX idx_suggestion
+        ON oapen_suggestions.suggestions(handle, suggestion);
+
+        ALTER TABLE oapen_suggestions.suggestions
+        ADD CONSTRAINT uq_Suggestion UNIQUE(handle, suggestion);
         """
     )
@@ -63,7 +70,6 @@ def drop_schema(connection) -> None:
         DROP TABLE IF EXISTS oapen_suggestions.suggestions CASCADE;
         DROP TABLE IF EXISTS oapen_suggestions.ngrams CASCADE;
         DROP TABLE IF EXISTS oapen_suggestions.endpoints CASCADE;
-        DROP TYPE IF EXISTS oapen_suggestions.suggestion CASCADE;
         DROP TYPE IF EXISTS oapen_suggestions.ngram CASCADE;
         """
     )
@@ -76,7 +82,15 @@ def get_endpoints(collections):
     COLLECTION_IMPORT_LIMIT = int(os.environ["COLLECTION_IMPORT_LIMIT"])

+    SKIPPED_COLLECTIONS = [
+        "1f7c8abd-677e-4275-8b4e-3d8da49f7b36",
+        "93223e33-3c7c-47bd-9356-a7878b2814a0",
+    ]
+
     for collection in collections:
+        if collection["uuid"] in SKIPPED_COLLECTIONS:
+            continue
+
         num_items = (
             collection["numberItems"]
             if COLLECTION_IMPORT_LIMIT == 0

@@ -16,9 +16,9 @@ SCORE_THRESHOLD = 1
 TOP_K_NGRAMS_COUNT = 30

 # Number of threads to generate suggestions
-SUGGESTIONS_MAX_WORKERS = 250
-SUGGESTIONS_MAX_ITEMS = 25
+SUGGESTIONS_MAX_WORKERS = 10
+SUGGESTIONS_MAX_ITEMS = 50

 # Update items that were modifed since X days ago
 UPDATE_DAYS_BEFORE = 30

-REFRESH_IMPORT_LIMIT = 50
+REFRESH_IMPORT_LIMIT = 0

@@ -4,7 +4,9 @@ import signal
 import sys
 import time

+import schedule
 from clean import run as run_clean
+from clean import seed_endpoints
 from data.connection import get_connection
 from data.oapen_db import OapenDB
 from generate_suggestions import run as run_generate_suggestions
@@ -12,8 +14,15 @@ from logger.base_logger import logger
 from refresh_items import run as run_refresh_items
 from seed import run as run_seed

+conn = get_connection()
+db = OapenDB(conn)
+
+logger.info("Daemon up")
+

 def harvest():
+    seed_endpoints()
+    urls = db.get_incomplete_urls()
+    if len(urls) > 0:
         run_seed()
         run_generate_suggestions()
@@ -23,12 +32,6 @@ def refresh():
     run_generate_suggestions()

-logger.info("Daemon up")
-
-conn = get_connection()
-db = OapenDB(conn)
-

 def signal_handler(signal, frame):
     conn.close()
     logger.info("Daemon exiting.")
@@ -37,29 +40,25 @@ def signal_handler(signal, frame):
 signal.signal(signal.SIGINT, signal_handler)

-logger.info("Daemon up")
-
-conn = get_connection()
-db = OapenDB(conn)
-
 if int(os.environ["RUN_CLEAN"]) == 1 or (
-    not db.table_exists("suggestions") or not db.table_exists("ngrams")
+    not db.table_exists("suggestions")
+    or not db.table_exists("ngrams")
+    or not db.table_exists("endpoints")
 ):
     run_clean()

 harvest()

-harvest_acc = 0
-refresh_acc = 0
+schedule.every().day.at("20:00").do(refresh)
+schedule.every().sunday.at("22:00").do(harvest)

 while True:
-    if harvest_acc >= int(os.environ["HARVEST_PERIOD"]):
-        urls = db.get_incomplete_urls()
-        if len(urls) > 0:
-            harvest()
-        harvest_acc = 0
-
-    if refresh_acc >= int(os.environ["REFRESH_PERIOD"]):
-        refresh()
-        refresh_acc = 0
-
+    schedule.run_pending()
     time.sleep(60)
-    refresh_acc += 60
-    harvest_acc += 60

 logger.info("Daemon down")
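The daemon's timing now comes from the schedule library rather than the hand-rolled HARVEST_PERIOD / REFRESH_PERIOD accumulators. A minimal sketch of that pattern, with print statements standing in for the daemon's refresh() and harvest():

    import time
    import schedule

    def refresh_job():
        print("refresh")  # stand-in for the daemon's refresh()

    def harvest_job():
        print("harvest")  # stand-in for the daemon's harvest()

    # register jobs once; run_pending() fires whichever jobs are due on each pass
    schedule.every().day.at("20:00").do(refresh_job)
    schedule.every().sunday.at("22:00").do(harvest_job)

    while True:
        schedule.run_pending()
        time.sleep(60)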

@@ -1,18 +1,15 @@
 import concurrent.futures
 import time
+from collections import Counter
 from threading import Lock
 from typing import List

 import config
-import tqdm
 from data.connection import close_connection, get_connection
 from data.oapen_db import OapenDB
 from logger.base_logger import logger
 from model.oapen_types import NgramRow, SuggestionRow
+from tqdm.auto import tqdm

-# for each item in ngrams
-# get suggestions for item
-# store in database

 # initial seed -> get suggestions on everything n^2
 # weekly update ->
@@ -21,98 +18,94 @@ from model.oapen_types import NgramRow, SuggestionRow
 # optimization: only suggest once per pair

-def suggestion_task(items, all_items, mutex, suggestions):
+def get_ngrams_list(arr: List[NgramRow]):
+    return [x[0] for x in arr[1][0 : min(len(arr[1]), config.TOP_K_NGRAMS_COUNT)]]
+
+
+def suggestion_task(items, all_items, db_mutex, db):
+    suggestions: List[SuggestionRow] = []
     for item_a in items:
         handle_a = item_a[0]
-        ngrams_a = [
-            x[0] for x in item_a[1][0 : min(len(item_a[1]), config.TOP_K_NGRAMS_COUNT)]
-        ]
-        item_suggestions = []
         for item_b in all_items:
             handle_b = item_b[0]
-            ngrams_b = [
-                x[0]
-                for x in item_b[1][0 : min(len(item_b[1]), config.TOP_K_NGRAMS_COUNT)]
-            ]
             if handle_a == handle_b:
                 continue

-            repeated = len(list(filter(lambda x: x in ngrams_b, ngrams_a)))
-            if repeated >= config.SCORE_THRESHOLD:
-                item_suggestions.append((handle_b, repeated))
+            ngrams_shared = len(list(filter(lambda x: x in item_b[1], item_a[1])))
+            if ngrams_shared >= config.SCORE_THRESHOLD:
+                suggestions.append((handle_a, handle_a, handle_b, ngrams_shared))

-        mutex.acquire()
-        item_suggestions.sort(key=lambda x: x[1], reverse=True)
-        mutex.release()
-        suggestions.append((handle_a, handle_a, item_suggestions))
+    db_mutex.acquire()
+    db.add_many_suggestions(suggestions)
+    db_mutex.release()
+    return len(items)
+
+
+def refresh(future, counter, pbar):
+    pbar.update(future.result())
+    counter["items_updated"] += future.result()
+    pbar.refresh()


 def run():
-    mutex = Lock()
     connection = get_connection()
     db = OapenDB(connection)
-    all_items: List[NgramRow] = db.get_all_ngrams()
-    suggestions: List[SuggestionRow] = []
+    all_items: List[NgramRow] = db.get_all_ngrams(get_empty=False)

-    # Remove any empty entries
-    all_items = list(filter(lambda item: len(item[1]) != 0, all_items))
-    logger.info("Generating suggestions for {0} items.".format(str(len(all_items))))
-    futures = []
-
-    # Get only top k ngrams for all items before processing
-    for item in all_items:
-        item = (
-            item[0],
-            [x[0] for x in item[1]][0 : min(len(item[1]), config.TOP_K_NGRAMS_COUNT)],
-        )
-
-    time_start = time.perf_counter()
-    n = config.SUGGESTIONS_MAX_ITEMS
-    chunks = [all_items[i : i + n] for i in range(0, len(all_items), n)]
-
-    with concurrent.futures.ThreadPoolExecutor(
+    executor = concurrent.futures.ThreadPoolExecutor(
         max_workers=config.SUGGESTIONS_MAX_WORKERS
-    ) as executor:
-        for chunk in chunks:
-            future = executor.submit(
-                suggestion_task, chunk, all_items, mutex, suggestions
-            )
-            futures.append(future)
+    )
+    futures = []
+    db_mutex = Lock()
+    counter = Counter(items_updated=0)

-    with tqdm.tqdm(
-        total=len(futures),
+    pbar = tqdm(
+        total=len(all_items),
         mininterval=0,
         miniters=1,
         leave=True,
         position=0,
         initial=0,
-    ) as pbar:
+    )

+    logger.info("Getting suggestions for {0} items...".format(str(len(all_items))))
+    time_start = time.perf_counter()
+
+    # Get only top k ngrams for all items before processing
+    for item in all_items:
+        ngrams = get_ngrams_list(item)
+        item = (item[0], ngrams)
+
+    chunks = [
+        all_items[i : i + config.SUGGESTIONS_MAX_ITEMS]
+        for i in range(0, len(all_items), config.SUGGESTIONS_MAX_ITEMS)
+    ]
+
+    for chunk in chunks:
+        future = executor.submit(suggestion_task, chunk, all_items, db_mutex, db)
+        future.add_done_callback(lambda x: refresh(x, counter, pbar))
+        futures.append(future)
+
     for future in concurrent.futures.as_completed(futures):
-        future.result()
-        pbar.update(1)
-
-    db.add_many_suggestions(suggestions)
+        pass

     logger.info(
-        "Updated suggestions for "
-        + str(len(all_items))
-        + " items in "
+        "Updated "
+        + str(counter["items_updated"])
+        + " suggestions in "
         + str(time.perf_counter() - time_start)
         + "s."
     )
+    executor.shutdown(wait=True)
+    pbar.close()

     close_connection(connection)
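The rewritten run() submits fixed-size chunks to a ThreadPoolExecutor and uses a done-callback to advance a shared tqdm bar and a Counter of processed items. A minimal standalone sketch of that callback pattern, with a dummy task standing in for suggestion_task and made-up chunks:

    import concurrent.futures
    from collections import Counter
    from tqdm.auto import tqdm

    def task(chunk):
        return len(chunk)  # stand-in for suggestion_task: reports items processed

    chunks = [[1, 2], [3, 4], [5]]  # made-up data
    counter = Counter(items_updated=0)
    pbar = tqdm(total=5)

    def on_done(future):
        # runs in a worker thread when each future completes
        pbar.update(future.result())
        counter["items_updated"] += future.result()

    executor = concurrent.futures.ThreadPoolExecutor(max_workers=2)
    futures = []
    for chunk in chunks:
        future = executor.submit(task, chunk)
        future.add_done_callback(on_done)
        futures.append(future)

    for future in concurrent.futures.as_completed(futures):
        pass

    executor.shutdown(wait=True)
    pbar.close()
    print(counter["items_updated"])  # 5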

@@ -0,0 +1,41 @@
+import test_oapen
+import test_stopwords
+import test_ngrams
+
+def run_test(run_msg, func):
+    print(run_msg, end = " ")
+    func()
+    print("OK")  # will throw on fail
+
+def main():
+    print("Testing connection to OAPEN.")
+    try:
+        run_test("Attempting to get item [Embodying Contagion]:", test_oapen.test_get_item)
+        run_test("Attempting to get null item:", test_oapen.test_get_item_404)
+        run_test("Attempting to get collection limit by label [Knowledge Unlatched (KU)]:",
+                 test_oapen.test_get_collection_limit)
+        run_test("Attempting to get null collection:", test_oapen.test_get_collection_404)
+    except Exception as e:
+        print("\nFailed:")
+        print(e)
+
+    print("\nTesting stopwords generation.")
+    try:
+        run_test("Testing stopwords correctly generated:",
+                 test_stopwords.test_stopwords_contains_all)
+    except Exception as e:
+        print("Failed:")
+        print(e)
+
+    print("\nTesting ngrams functionality.")
+    try:
+        run_test("Testing process_text:", test_ngrams.test_process_text)
+        run_test("Testing ngram generation:", test_ngrams.test_generate_ngram)
+        run_test("Testing similarity score:", test_ngrams.test_similarity_score)
+    except Exception as e:
+        print("Failed:")
+        print(e)
+
+if __name__ == "__main__":
+    main()

@@ -0,0 +1,51 @@
+import model.ngrams as ngrams
+
+test_text1 = "Foxes are cunning animals. There was a quick, red fox known to avoid crossing roads during the day, doing so only at night."
+test_text2 = "The quick red fox jumped over the lazy brown dog. It had a fantastic time doing so, as it felt finally free. The fox had been in the zoo for far too long, held in captivity."
+
+processed_text1 = ['foxes', 'cunning', 'animals', 'quick', 'red', 'fox', 'known', 'avoid', 'crossing', 'roads', 'day', 'night']
+processed_text2 = ['quick', 'red', 'fox', 'jumped', 'lazy', 'brown', 'dog', 'fantastic', 'time', 'felt', 'finally', 'free', 'fox', 'zoo', 'far', 'long', 'held', 'captivity']
+
+ngrams1 = {
+    'foxes cunning animals': 1,
+    'cunning animals quick': 1,
+    'animals quick red': 1,
+    'quick red fox': 1,
+    'red fox known': 1,
+    'fox known avoid': 1,
+    'known avoid crossing': 1,
+    'avoid crossing roads': 1,
+    'crossing roads day': 1,
+    'roads day night': 1
+}
+
+ngrams2 = {
+    'quick red fox': 1,
+    'red fox jumped': 1,
+    'fox jumped lazy': 1,
+    'jumped lazy brown': 1,
+    'lazy brown dog': 1,
+    'brown dog fantastic': 1,
+    'dog fantastic time': 1,
+    'fantastic time felt': 1,
+    'time felt finally': 1,
+    'felt finally free': 1,
+    'finally free fox': 1,
+    'free fox zoo': 1,
+    'fox zoo far': 1,
+    'zoo far long': 1,
+    'far long held': 1,
+    'long held captivity': 1
+}
+
+def test_process_text():
+    assert(ngrams.process_text(test_text1) == processed_text1)
+    assert(ngrams.process_text(test_text2) == processed_text2)
+
+def test_generate_ngram():
+    assert(ngrams.generate_ngram(processed_text1) == ngrams1)
+    assert(ngrams.generate_ngram(processed_text2) == ngrams2)
+
+def test_similarity_score():
+    assert(ngrams.get_similarity_score(ngrams1, ngrams2, n=5, as_percent=False) == 1)
+    assert(ngrams.get_similarity_score(ngrams1, ngrams2, n=5, as_percent=True) == 0.2)

@@ -1,6 +1,6 @@
 from typing import List

-import src.data.oapen as OapenAPI
+import data.oapen as OapenAPI

 from model.oapen_types import OapenItem

@@ -0,0 +1,23 @@
+from model.stopwords_processor import STOPWORDS
+import model.stopwords.stopwords_full_list as stopwords_full_list
+
+# currently contains stopwords_filter, stopwords_publisher, stopwords_broken, stopwords_dutch_extra
+
+# tests all at once
+def test_stopwords_contains_all():
+    assert(all(x in STOPWORDS for x in stopwords_full_list.stopwords_filter))
+    assert(all(x in STOPWORDS for x in stopwords_full_list.stopwords_publisher))
+    assert(all(x in STOPWORDS for x in stopwords_full_list.stopwords_broken))
+    assert(all(x in STOPWORDS for x in stopwords_full_list.stopwords_dutch_extra))
+
+# individual tests provided if needed
+def test_stopwords_contains_stopwords_filter():
+    assert(all(x in STOPWORDS for x in stopwords_full_list.stopwords_filter))
+
+def test_stopwords_contains_stopwords_publisher():
+    assert(all(x in STOPWORDS for x in stopwords_full_list.stopwords_publisher))
+
+def test_stopwords_contains_stopwords_broken():
+    assert(all(x in STOPWORDS for x in stopwords_full_list.stopwords_broken))
+
+def test_stopwords_contains_stopwords_dutch_extra():
+    assert(all(x in STOPWORDS for x in stopwords_full_list.stopwords_dutch_extra))

run-api.sh Normal file → Executable file

run-web.sh Normal file → Executable file