celinanperalta/OAP 65 (#54)
Add additional parameters to OAPEN DB and refactor engine typespull/57/head
parent
3f05e94e67
commit
7e33ac4677
2
.env
2
.env
|
@ -6,4 +6,4 @@ POSTGRES_PORT=5432
|
||||||
POSTGRES_DB_NAME=postgres
|
POSTGRES_DB_NAME=postgres
|
||||||
POSTGRES_USERNAME=postgres
|
POSTGRES_USERNAME=postgres
|
||||||
POSTGRES_PASSWORD=postgrespw
|
POSTGRES_PASSWORD=postgrespw
|
||||||
POSTGRES_SSLMODE=require
|
POSTGRES_SSLMODE=require
|
||||||
|
|
|
@ -1,40 +1,36 @@
|
||||||
|
import random
|
||||||
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from logger.base_logger import logger
|
from logger.base_logger import logger
|
||||||
from model.oapen_types import OapenItem, transform_item_data
|
from model.oapen_types import OapenItem
|
||||||
|
|
||||||
SERVER_PATH = "https://library.oapen.org"
|
SERVER_PATH = "https://library.oapen.org"
|
||||||
GET_COMMUNITY = "/rest/communities/{id}"
|
|
||||||
GET_COLLECTION = "/rest/collections/{id}"
|
|
||||||
GET_COLLECTIONS = "/rest/collections/"
|
GET_COLLECTIONS = "/rest/collections/"
|
||||||
GET_ITEM_BITSTREAMS = "/rest/items/{id}/bitstreams"
|
GET_ITEM_BITSTREAMS = "/rest/items/{id}/bitstreams"
|
||||||
GET_COLLECTION_ITEMS = "/rest/collections/{id}/items"
|
GET_COLLECTION_ITEMS = "/rest/collections/{id}/items"
|
||||||
GET_COMMUNITY_COLLECTIONS = "/rest/communities/{id}/collections"
|
|
||||||
GET_ITEM = "/rest/search?query=handle:%22{handle}%22&expand=bitstreams,metadata"
|
|
||||||
GET_COLLECTION_BY_LABEL = (
|
|
||||||
"/rest/search?query=oapen.collection:%22{label}%22&expand=metadata"
|
|
||||||
)
|
|
||||||
|
|
||||||
GET_WEEKLY_ITEMS = "/rest/search?query=dc.date.accessioned_dt:[NOW-7DAY/DAY+TO+NOW]&expand=bitstreams,metadata"
|
GET_WEEKLY_ITEMS = "/rest/search?query=dc.date.accessioned_dt:[NOW-7DAY/DAY+TO+NOW]&expand=bitstreams,metadata"
|
||||||
GET_UPDATED_ITEMS = (
|
GET_UPDATED_ITEMS = (
|
||||||
"/rest/search?query=lastModified%3E{date}&expand=metadata,bitsteams" # YYYY-MM-DD
|
"/rest/search?query=lastModified%3E{date}&expand=metadata,bitsteams" # YYYY-MM-DD
|
||||||
)
|
)
|
||||||
|
|
||||||
# This is the only community we care about right now
|
USER_AGENTS = [
|
||||||
BOOKS_COMMUNITY_ID = "3579505d-9d1b-4745-bcaf-a37329d25c69"
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
|
||||||
|
]
|
||||||
|
|
||||||
GET_HEADERS = {
|
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"
|
def transform_item_data(item) -> OapenItem:
|
||||||
}
|
thumbnail = get_bitstream_thumbnail(item)
|
||||||
|
text = get_bitstream_text(item)
|
||||||
|
return OapenItem(item["handle"], item["name"], thumbnail, text)
|
||||||
|
|
||||||
|
|
||||||
def transform_multiple_items_data(items) -> List[OapenItem]:
|
def transform_multiple_items_data(items) -> List[OapenItem]:
|
||||||
return [
|
return [transform_item_data(item) for item in items]
|
||||||
transform_item_data(item, get_bitstream_text(item["bitstreams"]))
|
|
||||||
for item in items
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def get(endpoint, params=None):
|
def get(endpoint, params=None):
|
||||||
|
@ -43,7 +39,7 @@ def get(endpoint, params=None):
|
||||||
url=SERVER_PATH + endpoint,
|
url=SERVER_PATH + endpoint,
|
||||||
params=params,
|
params=params,
|
||||||
timeout=(None, 120),
|
timeout=(None, 120),
|
||||||
headers=GET_HEADERS,
|
headers={"User-Agent": random.choice(USER_AGENTS)},
|
||||||
)
|
)
|
||||||
|
|
||||||
ret = None
|
ret = None
|
||||||
|
@ -54,41 +50,19 @@ def get(endpoint, params=None):
|
||||||
ret = res.content
|
ret = res.content
|
||||||
else:
|
else:
|
||||||
logger.error("ERROR - GET {}: {}".format(res.url, res.status_code))
|
logger.error("ERROR - GET {}: {}".format(res.url, res.status_code))
|
||||||
|
|
||||||
logger.debug("GET {}: {}".format(res.url, res.status_code))
|
logger.debug("GET {}: {}".format(res.url, res.status_code))
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
||||||
def get_all_communities():
|
|
||||||
return get(endpoint=GET_COMMUNITY)
|
|
||||||
|
|
||||||
|
|
||||||
def get_community(id):
|
|
||||||
return get(endpoint=GET_COMMUNITY.format(id=id))
|
|
||||||
|
|
||||||
|
|
||||||
def get_collection(id):
|
|
||||||
return get(endpoint=GET_COLLECTION.format(id=id))
|
|
||||||
|
|
||||||
|
|
||||||
def get_item(handle) -> OapenItem:
|
|
||||||
res = get(endpoint=GET_ITEM.format(handle=handle))
|
|
||||||
|
|
||||||
if res is not None and len(res) > 0:
|
|
||||||
return transform_item_data(res[0], get_bitstream_text(res[0]["bitstreams"]))
|
|
||||||
return res
|
|
||||||
|
|
||||||
|
|
||||||
def get_collections_from_community(id):
|
|
||||||
res = get(endpoint=GET_COMMUNITY_COLLECTIONS.format(id=id))
|
|
||||||
return res
|
|
||||||
|
|
||||||
|
|
||||||
def get_all_collections():
|
def get_all_collections():
|
||||||
res = get(endpoint=GET_COLLECTIONS)
|
res = get(endpoint=GET_COLLECTIONS)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
# i.e. /rest/collections/2154c0ca-7814-4cc8-a869-3de4215c4121/items?limit=25&offset=0&expand=bitstreams,metadata
|
||||||
|
# This is a redundancy of get_collection_items_by_id meant to be used with endpoints that
|
||||||
|
# are constructed in tasks/clean.py and cached in the database. This is not a high priority
|
||||||
|
# issue, but it certainly can be cleaned up in the future.
|
||||||
def get_collection_items_by_endpoint(endpoint) -> List[OapenItem]:
|
def get_collection_items_by_endpoint(endpoint) -> List[OapenItem]:
|
||||||
res = get(endpoint=endpoint)
|
res = get(endpoint=endpoint)
|
||||||
|
|
||||||
|
@ -108,39 +82,15 @@ def get_collection_items_by_id(id, limit=None, offset=None) -> List[OapenItem]:
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
def get_collection_items_by_label(label, limit=None) -> List[OapenItem]:
|
|
||||||
label = "+".join(label.split(" "))
|
|
||||||
res = get(
|
|
||||||
endpoint=GET_COLLECTION_BY_LABEL.format(label=label),
|
|
||||||
params={"limit": limit},
|
|
||||||
)
|
|
||||||
|
|
||||||
if res is not None and len(res) > 0:
|
|
||||||
return transform_multiple_items_data(res)
|
|
||||||
return res
|
|
||||||
|
|
||||||
|
|
||||||
def get_bitstream_text(bitstreams, limit=None) -> str:
|
|
||||||
if bitstreams is not None:
|
|
||||||
for bitstream in bitstreams:
|
|
||||||
if bitstream["mimeType"] == "text/plain":
|
|
||||||
retrieveLink = bitstream["retrieveLink"]
|
|
||||||
text = str(get(retrieveLink).decode("utf-8"))
|
|
||||||
return text if limit is None else text[:limit]
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
# Gets all items added in the last week
|
# Gets all items added in the last week
|
||||||
def get_weekly_items(limit=None) -> List[OapenItem]:
|
def get_weekly_items(limit=None) -> List[OapenItem]:
|
||||||
res = get(endpoint=GET_WEEKLY_ITEMS, params={"limit": limit})
|
res = get(endpoint=GET_WEEKLY_ITEMS, params={"limit": limit})
|
||||||
|
|
||||||
if res is not None and len(res) > 0:
|
if res is not None and len(res) > 0:
|
||||||
return transform_multiple_items_data(res)
|
return transform_multiple_items_data(res)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
def get_updated_items(date: datetime, limit=None, offset=None) -> List[OapenItem]:
|
def get_updated_items(date: datetime, limit=None, offset=None) -> List[OapenItem]:
|
||||||
|
|
||||||
date = date.strftime("%Y-%m-%d")
|
date = date.strftime("%Y-%m-%d")
|
||||||
res = get(
|
res = get(
|
||||||
endpoint=GET_UPDATED_ITEMS.format(date=date),
|
endpoint=GET_UPDATED_ITEMS.format(date=date),
|
||||||
|
@ -150,3 +100,27 @@ def get_updated_items(date: datetime, limit=None, offset=None) -> List[OapenItem
|
||||||
if res is not None and len(res) > 0:
|
if res is not None and len(res) > 0:
|
||||||
return transform_multiple_items_data(res)
|
return transform_multiple_items_data(res)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
# General function to extract the retrieveLink of a bitstream with type bitstream_type
|
||||||
|
def __get_bitstream_url(bitstreams, bundle_name: str) -> str or None:
|
||||||
|
if bitstreams is not None:
|
||||||
|
for bitstream in bitstreams:
|
||||||
|
if bitstream["bundleName"] == bundle_name:
|
||||||
|
return bitstream["retrieveLink"]
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# limit: int - number of characters
|
||||||
|
def get_bitstream_text(item) -> str:
|
||||||
|
retrieveLink = __get_bitstream_url(item["bitstreams"], "TEXT")
|
||||||
|
if retrieveLink is not None:
|
||||||
|
time.sleep(1) # Attempt to avoid rate limiting
|
||||||
|
text = str(get(retrieveLink).decode("utf-8"))
|
||||||
|
return text
|
||||||
|
else:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def get_bitstream_thumbnail(item):
|
||||||
|
return __get_bitstream_url(item["bitstreams"], "THUMBNAIL")
|
||||||
|
|
|
@ -13,16 +13,19 @@ class OapenDB:
|
||||||
seen = set()
|
seen = set()
|
||||||
res = []
|
res = []
|
||||||
for i in items:
|
for i in items:
|
||||||
if i[0] not in seen:
|
if i not in seen:
|
||||||
res.append(i)
|
res.append(i)
|
||||||
seen.add(i[0])
|
seen.add(i)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def mogrify_ngrams(self, ngrams: List[NgramRow]) -> str:
|
def mogrify_ngrams(self, ngrams: List[NgramRow]) -> str:
|
||||||
ngrams = self.deduplicate(ngrams)
|
ngrams = self.deduplicate(ngrams)
|
||||||
cursor = self.connection.cursor()
|
cursor = self.connection.cursor()
|
||||||
args = ",".join(
|
args = ",".join(
|
||||||
cursor.mogrify("(%s,%s::oapen_suggestions.ngram[])", x).decode("utf-8")
|
cursor.mogrify(
|
||||||
|
"(%s,%s,%s,%s::oapen_suggestions.ngram[])",
|
||||||
|
(x.handle, x.name, x.thumbnail, x.ngrams),
|
||||||
|
).decode("utf-8")
|
||||||
for x in ngrams
|
for x in ngrams
|
||||||
)
|
)
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
@ -32,7 +35,17 @@ class OapenDB:
|
||||||
suggestions = self.deduplicate(suggestions)
|
suggestions = self.deduplicate(suggestions)
|
||||||
cursor = self.connection.cursor()
|
cursor = self.connection.cursor()
|
||||||
args = ",".join(
|
args = ",".join(
|
||||||
cursor.mogrify("(%s,%s,%s,%s)", x).decode("utf-8") for x in suggestions
|
cursor.mogrify(
|
||||||
|
"(%s,%s,%s,%s,%s)",
|
||||||
|
(
|
||||||
|
x.handle,
|
||||||
|
x.suggestion,
|
||||||
|
x.suggestion_name,
|
||||||
|
x.suggestion_thumbnail,
|
||||||
|
x.score,
|
||||||
|
),
|
||||||
|
).decode("utf-8")
|
||||||
|
for x in suggestions
|
||||||
)
|
)
|
||||||
cursor.close()
|
cursor.close()
|
||||||
return args
|
return args
|
||||||
|
@ -93,11 +106,11 @@ class OapenDB:
|
||||||
def add_many_suggestions(self, suggestions: List[SuggestionRow]) -> None:
|
def add_many_suggestions(self, suggestions: List[SuggestionRow]) -> None:
|
||||||
cursor = self.connection.cursor()
|
cursor = self.connection.cursor()
|
||||||
args = self.mogrify_suggestions(suggestions)
|
args = self.mogrify_suggestions(suggestions)
|
||||||
|
|
||||||
query = f"""
|
query = f"""
|
||||||
INSERT INTO oapen_suggestions.suggestions (handle, name, suggestion, score)
|
INSERT INTO oapen_suggestions.suggestions
|
||||||
VALUES {args}
|
VALUES {args}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
cursor.execute(query)
|
cursor.execute(query)
|
||||||
except (Exception, psycopg2.Error) as error:
|
except (Exception, psycopg2.Error) as error:
|
||||||
|
@ -127,13 +140,12 @@ class OapenDB:
|
||||||
cursor = self.connection.cursor()
|
cursor = self.connection.cursor()
|
||||||
args = self.mogrify_ngrams(ngrams)
|
args = self.mogrify_ngrams(ngrams)
|
||||||
query = f"""
|
query = f"""
|
||||||
INSERT INTO oapen_suggestions.ngrams (handle, ngrams)
|
INSERT INTO oapen_suggestions.ngrams
|
||||||
VALUES {args}
|
VALUES {args}
|
||||||
ON CONFLICT (handle)
|
ON CONFLICT (handle)
|
||||||
DO
|
DO
|
||||||
UPDATE SET ngrams = excluded.ngrams
|
UPDATE SET ngrams = excluded.ngrams
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cursor.execute(query)
|
cursor.execute(query)
|
||||||
except (Exception, psycopg2.Error) as error:
|
except (Exception, psycopg2.Error) as error:
|
||||||
logger.error(error)
|
logger.error(error)
|
||||||
|
@ -144,7 +156,7 @@ class OapenDB:
|
||||||
def get_all_ngrams(self, get_empty=True) -> List[NgramRow]:
|
def get_all_ngrams(self, get_empty=True) -> List[NgramRow]:
|
||||||
cursor = self.connection.cursor()
|
cursor = self.connection.cursor()
|
||||||
query = """
|
query = """
|
||||||
SELECT handle, CAST (ngrams AS oapen_suggestions.ngram[]), created_at, updated_at
|
SELECT handle, name, thumbnail, CAST (ngrams AS oapen_suggestions.ngram[]), created_at, updated_at
|
||||||
FROM oapen_suggestions.ngrams
|
FROM oapen_suggestions.ngrams
|
||||||
"""
|
"""
|
||||||
if not get_empty:
|
if not get_empty:
|
||||||
|
@ -154,7 +166,7 @@ class OapenDB:
|
||||||
try:
|
try:
|
||||||
cursor.execute(query)
|
cursor.execute(query)
|
||||||
records = cursor.fetchall()
|
records = cursor.fetchall()
|
||||||
ret = records
|
ret = [NgramRow(*record) for record in records]
|
||||||
|
|
||||||
except (Exception, psycopg2.Error) as error:
|
except (Exception, psycopg2.Error) as error:
|
||||||
logger.error(error)
|
logger.error(error)
|
||||||
|
@ -172,6 +184,25 @@ class OapenDB:
|
||||||
cursor.execute(query)
|
cursor.execute(query)
|
||||||
records = cursor.fetchall()
|
records = cursor.fetchall()
|
||||||
|
|
||||||
|
ret = [SuggestionRow(*record) for record in records]
|
||||||
|
|
||||||
|
except (Exception, psycopg2.Error) as error:
|
||||||
|
logger.error(error)
|
||||||
|
finally:
|
||||||
|
cursor.close()
|
||||||
|
return ret
|
||||||
|
|
||||||
|
def get_suggestions_for_item(self, handle) -> List[SuggestionRow]:
|
||||||
|
cursor = self.connection.cursor()
|
||||||
|
query = """
|
||||||
|
SELECT * FROM oapen_suggestions.suggestions
|
||||||
|
WHERE handle = \'%s\'
|
||||||
|
"""
|
||||||
|
ret = None
|
||||||
|
try:
|
||||||
|
cursor.execute(query, handle)
|
||||||
|
records = cursor.fetchall()
|
||||||
|
|
||||||
ret = records
|
ret = records
|
||||||
|
|
||||||
except (Exception, psycopg2.Error) as error:
|
except (Exception, psycopg2.Error) as error:
|
||||||
|
@ -251,7 +282,7 @@ class OapenDB:
|
||||||
cursor.execute(query)
|
cursor.execute(query)
|
||||||
records = cursor.fetchall()
|
records = cursor.fetchall()
|
||||||
|
|
||||||
ret = records
|
ret = [UrlRow(*record) for record in records]
|
||||||
|
|
||||||
except (Exception, psycopg2.Error) as error:
|
except (Exception, psycopg2.Error) as error:
|
||||||
logger.error(error)
|
logger.error(error)
|
||||||
|
|
|
@ -2,6 +2,12 @@ import re
|
||||||
import string
|
import string
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
from .oapen_types import ( # pylint: disable=relative-beyond-top-level
|
||||||
|
NgramDict,
|
||||||
|
NgramRow,
|
||||||
|
OapenItem,
|
||||||
|
)
|
||||||
|
|
||||||
import nltk
|
import nltk
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from nltk import word_tokenize
|
from nltk import word_tokenize
|
||||||
|
@ -10,9 +16,6 @@ from .stopwords_processor import STOPWORDS
|
||||||
|
|
||||||
nltk.download("punkt")
|
nltk.download("punkt")
|
||||||
|
|
||||||
from .oapen_types import NgramDict, NgramRowWithoutDate, OapenItem
|
|
||||||
|
|
||||||
|
|
||||||
def process_text(text):
|
def process_text(text):
|
||||||
p_text = "".join([c for c in text.lower() if c not in string.punctuation])
|
p_text = "".join([c for c in text.lower() if c not in string.punctuation])
|
||||||
stopwords_regex = re.compile(r"\b%s\b" % r"\b|\b".join(map(re.escape, STOPWORDS)))
|
stopwords_regex = re.compile(r"\b%s\b" % r"\b|\b".join(map(re.escape, STOPWORDS)))
|
||||||
|
@ -24,19 +27,6 @@ def process_text(text):
|
||||||
return filtered_words
|
return filtered_words
|
||||||
|
|
||||||
|
|
||||||
def make_df(data: List[OapenItem]):
|
|
||||||
df = pd.DataFrame(columns=["handle", "name", "lang", "text"])
|
|
||||||
|
|
||||||
for item in data:
|
|
||||||
text = process_text(item.text)
|
|
||||||
df.loc[len(df.index)] = [item.handle, item.name, item.language, text]
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def get_text_by_handle(df, handle):
|
|
||||||
return df.loc[df.handle == handle].text[0]
|
|
||||||
|
|
||||||
|
|
||||||
def sort_ngrams_by_count(ngrams: NgramDict):
|
def sort_ngrams_by_count(ngrams: NgramDict):
|
||||||
return sorted(ngrams.items(), key=lambda item: item[1], reverse=True)
|
return sorted(ngrams.items(), key=lambda item: item[1], reverse=True)
|
||||||
|
|
||||||
|
@ -51,11 +41,6 @@ def generate_ngram(text, n=3) -> NgramDict:
|
||||||
return dict(sort_ngrams_by_count(ngrams)) # return sorted by count
|
return dict(sort_ngrams_by_count(ngrams)) # return sorted by count
|
||||||
|
|
||||||
|
|
||||||
def generate_ngram_by_handle(df, handle, n=3) -> NgramDict:
|
|
||||||
text = get_text_by_handle(df, handle)
|
|
||||||
return generate_ngram(text, n)
|
|
||||||
|
|
||||||
|
|
||||||
def get_n_most_occuring(dic: NgramDict, n=100):
|
def get_n_most_occuring(dic: NgramDict, n=100):
|
||||||
sorted_dict = dict(
|
sorted_dict = dict(
|
||||||
sort_ngrams_by_count(dic)
|
sort_ngrams_by_count(dic)
|
||||||
|
@ -93,12 +78,16 @@ def get_similarity_score_by_dict_count(ngrams1: NgramDict, ngrams2: NgramDict) -
|
||||||
return repeated / total
|
return repeated / total
|
||||||
|
|
||||||
|
|
||||||
def get_ngrams_for_items(
|
def get_ngrams_for_items(items: List[OapenItem], n=3, ngram_limit=30) -> List[NgramRow]:
|
||||||
items: List[OapenItem], n=3, ngram_limit=30
|
|
||||||
) -> List[NgramRowWithoutDate]:
|
|
||||||
rows = []
|
rows = []
|
||||||
for item in items:
|
for item in items:
|
||||||
text = process_text(item.text)
|
text = process_text(item.text)
|
||||||
ngrams = generate_ngram(text, n)
|
ngrams = generate_ngram(text, n)
|
||||||
rows.append((item.handle, list(sort_ngrams_by_count(ngrams))[0:ngram_limit]))
|
row = NgramRow(
|
||||||
|
handle=item.handle,
|
||||||
|
name=item.name,
|
||||||
|
thumbnail=item.thumbnail,
|
||||||
|
ngrams=list(sort_ngrams_by_count(ngrams))[0:ngram_limit],
|
||||||
|
)
|
||||||
|
rows.append(row)
|
||||||
return rows
|
return rows
|
||||||
|
|
|
@ -1,22 +1,12 @@
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Dict, List, Tuple, Union
|
from typing import Dict, List, NamedTuple
|
||||||
|
|
||||||
|
|
||||||
class OapenItem:
|
class OapenItem(NamedTuple):
|
||||||
def __init__(
|
handle: str
|
||||||
self, uuid, name, handle, expand, link, metadata, bitstreams, text: str
|
name: str
|
||||||
):
|
thumbnail: str
|
||||||
self.uuid = uuid
|
text: str
|
||||||
self.name = name
|
|
||||||
self.handle = handle
|
|
||||||
self.expand = expand
|
|
||||||
self.link = link
|
|
||||||
self.metadata = metadata
|
|
||||||
self.bitstreams = bitstreams
|
|
||||||
|
|
||||||
language = list(filter(lambda x: x["key"] == "dc.language", self.metadata))
|
|
||||||
self.language = None if len(language) == 0 else language[0]["value"]
|
|
||||||
self.text = text
|
|
||||||
|
|
||||||
def __eq__(self, other):
|
def __eq__(self, other):
|
||||||
return self.handle == other.handle
|
return self.handle == other.handle
|
||||||
|
@ -24,29 +14,44 @@ class OapenItem:
|
||||||
def __hash__(self):
|
def __hash__(self):
|
||||||
return hash(self.handle, "handle")
|
return hash(self.handle, "handle")
|
||||||
|
|
||||||
|
class SuggestionRow(NamedTuple):
|
||||||
|
handle: str
|
||||||
|
suggestion: str
|
||||||
|
suggestion_name: str
|
||||||
|
suggestion_thumbnail: str
|
||||||
|
score: int
|
||||||
|
created_at: datetime = None
|
||||||
|
updated_at: datetime = None
|
||||||
|
|
||||||
SuggestionRowWithoutDate = Tuple[str, str, str, int]
|
def __eq__(self, other):
|
||||||
SuggestionRowWithDate = Tuple[str, str, str, int, datetime, datetime]
|
return self.handle == other.handle and self.suggestion == other.suggestion
|
||||||
SuggestionRow = Union[SuggestionRowWithDate, SuggestionRowWithoutDate]
|
|
||||||
|
def __hash__(self) -> int:
|
||||||
|
return hash((self.handle, self.suggestion))
|
||||||
|
|
||||||
|
class Ngram(NamedTuple):
|
||||||
|
ngram: str
|
||||||
|
count: int
|
||||||
|
|
||||||
|
class NgramRow(NamedTuple):
|
||||||
|
handle: str
|
||||||
|
name: str
|
||||||
|
thumbnail: str
|
||||||
|
ngrams: List[Ngram]
|
||||||
|
created_at: datetime = None
|
||||||
|
updated_at: datetime = None
|
||||||
|
|
||||||
|
def __eq__(self, other):
|
||||||
|
return self.handle == other.handle
|
||||||
|
|
||||||
|
def __hash__(self) -> int:
|
||||||
|
return hash(self.handle)
|
||||||
|
|
||||||
Ngram = Tuple[str, int]
|
|
||||||
NgramRowWithoutDate = Tuple[str, List[Ngram]]
|
|
||||||
NgramRowWithDate = Tuple[str, List[Ngram], datetime, datetime]
|
|
||||||
NgramRow = Union[NgramRowWithDate, NgramRowWithoutDate]
|
|
||||||
|
|
||||||
NgramDict = Dict[str, int]
|
NgramDict = Dict[str, int]
|
||||||
|
|
||||||
UrlRow = Tuple[str, bool]
|
|
||||||
|
|
||||||
|
class UrlRow(NamedTuple):
|
||||||
def transform_item_data(item, text) -> OapenItem:
|
url: str
|
||||||
return OapenItem(
|
completed: bool
|
||||||
item["uuid"],
|
updated_at: datetime = None
|
||||||
item["name"],
|
|
||||||
item["handle"],
|
|
||||||
item["expand"],
|
|
||||||
item["link"],
|
|
||||||
item["metadata"],
|
|
||||||
item["bitstreams"],
|
|
||||||
text,
|
|
||||||
)
|
|
||||||
|
|
|
@ -25,17 +25,20 @@ def create_schema(connection) -> None:
|
||||||
$$ language 'plpgsql';
|
$$ language 'plpgsql';
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS oapen_suggestions.suggestions (
|
CREATE TABLE IF NOT EXISTS oapen_suggestions.suggestions (
|
||||||
handle text,
|
handle text,
|
||||||
name text,
|
suggestion text,
|
||||||
suggestion text,
|
suggestion_name text,
|
||||||
score int,
|
suggestion_thumbnail text,
|
||||||
created_at timestamp default current_timestamp,
|
score int,
|
||||||
updated_at timestamp default current_timestamp,
|
created_at timestamp default current_timestamp,
|
||||||
|
updated_at timestamp default current_timestamp,
|
||||||
PRIMARY KEY (handle, suggestion)
|
PRIMARY KEY (handle, suggestion)
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS oapen_suggestions.ngrams (
|
CREATE TABLE IF NOT EXISTS oapen_suggestions.ngrams (
|
||||||
handle text PRIMARY KEY,
|
handle text PRIMARY KEY,
|
||||||
|
name text,
|
||||||
|
thumbnail text,
|
||||||
ngrams oapen_suggestions.ngram[],
|
ngrams oapen_suggestions.ngram[],
|
||||||
created_at timestamp default current_timestamp,
|
created_at timestamp default current_timestamp,
|
||||||
updated_at timestamp default current_timestamp
|
updated_at timestamp default current_timestamp
|
||||||
|
@ -63,6 +66,7 @@ def create_schema(connection) -> None:
|
||||||
|
|
||||||
|
|
||||||
def drop_schema(connection) -> None:
|
def drop_schema(connection) -> None:
|
||||||
|
logger.warn("WARNING: DROPPING DATABASE!")
|
||||||
cursor = connection.cursor()
|
cursor = connection.cursor()
|
||||||
cursor.execute(
|
cursor.execute(
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -4,7 +4,7 @@ ITEMS_PER_IMPORT_THREAD = 25
|
||||||
IO_MAX_WORKERS = 5
|
IO_MAX_WORKERS = 5
|
||||||
|
|
||||||
# Delay for submitting new API call thread
|
# Delay for submitting new API call thread
|
||||||
HARVEST_THREAD_SPAWN_DELAY = 2
|
HARVEST_THREAD_SPAWN_DELAY = 5
|
||||||
|
|
||||||
# Size of list of items to process into ngrams per process
|
# Size of list of items to process into ngrams per process
|
||||||
NGRAMS_PER_INSERT = 100
|
NGRAMS_PER_INSERT = 100
|
||||||
|
|
|
@ -7,7 +7,7 @@ import time
|
||||||
import schedule
|
import schedule
|
||||||
from clean import run as run_clean
|
from clean import run as run_clean
|
||||||
from clean import seed_endpoints
|
from clean import seed_endpoints
|
||||||
from data.connection import get_connection
|
from data.connection import close_connection, get_connection
|
||||||
from data.oapen_db import OapenDB
|
from data.oapen_db import OapenDB
|
||||||
from generate_suggestions import run as run_generate_suggestions
|
from generate_suggestions import run as run_generate_suggestions
|
||||||
from logger.base_logger import logger
|
from logger.base_logger import logger
|
||||||
|
@ -16,12 +16,16 @@ from seed import run as run_seed
|
||||||
|
|
||||||
conn = get_connection()
|
conn = get_connection()
|
||||||
db = OapenDB(conn)
|
db = OapenDB(conn)
|
||||||
logger.info("Daemon up")
|
|
||||||
|
|
||||||
|
|
||||||
def harvest():
|
def harvest():
|
||||||
|
conn = get_connection()
|
||||||
|
db = OapenDB(conn)
|
||||||
seed_endpoints(conn)
|
seed_endpoints(conn)
|
||||||
urls = db.get_incomplete_urls()
|
urls = db.get_incomplete_urls()
|
||||||
|
|
||||||
|
close_connection(conn)
|
||||||
|
|
||||||
if len(urls) > 0:
|
if len(urls) > 0:
|
||||||
run_seed()
|
run_seed()
|
||||||
run_generate_suggestions()
|
run_generate_suggestions()
|
||||||
|
@ -32,33 +36,31 @@ def refresh():
|
||||||
run_generate_suggestions()
|
run_generate_suggestions()
|
||||||
|
|
||||||
|
|
||||||
def signal_handler(signal, frame):
|
def main():
|
||||||
conn.close()
|
def signal_handler(signal, frame):
|
||||||
logger.info("Daemon exiting.")
|
logger.info("Daemon exiting.")
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
|
signal.signal(signal.SIGINT, signal_handler)
|
||||||
|
|
||||||
|
logger.info("Daemon up")
|
||||||
|
|
||||||
|
if int(os.environ["RUN_CLEAN"]) == 1 or (
|
||||||
|
not db.table_exists("suggestions")
|
||||||
|
or not db.table_exists("ngrams")
|
||||||
|
or not db.table_exists("endpoints")
|
||||||
|
):
|
||||||
|
run_clean()
|
||||||
|
|
||||||
|
harvest()
|
||||||
|
|
||||||
|
schedule.every().day.at("20:00").do(refresh)
|
||||||
|
schedule.every().sunday.at("22:00").do(harvest)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
schedule.run_pending()
|
||||||
|
time.sleep(60)
|
||||||
|
|
||||||
|
|
||||||
signal.signal(signal.SIGINT, signal_handler)
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
logger.info("Daemon up")
|
|
||||||
|
|
||||||
conn = get_connection()
|
|
||||||
db = OapenDB(conn)
|
|
||||||
|
|
||||||
if int(os.environ["RUN_CLEAN"]) == 1 or (
|
|
||||||
not db.table_exists("suggestions")
|
|
||||||
or not db.table_exists("ngrams")
|
|
||||||
or not db.table_exists("endpoints")
|
|
||||||
):
|
|
||||||
run_clean()
|
|
||||||
|
|
||||||
harvest()
|
|
||||||
|
|
||||||
schedule.every().day.at("20:00").do(refresh)
|
|
||||||
schedule.every().sunday.at("22:00").do(harvest)
|
|
||||||
|
|
||||||
while True:
|
|
||||||
schedule.run_pending()
|
|
||||||
time.sleep(60)
|
|
||||||
|
|
||||||
logger.info("Daemon down")
|
|
||||||
|
|
|
@ -8,7 +8,7 @@ import config
|
||||||
from data.connection import close_connection, get_connection
|
from data.connection import close_connection, get_connection
|
||||||
from data.oapen_db import OapenDB
|
from data.oapen_db import OapenDB
|
||||||
from logger.base_logger import logger
|
from logger.base_logger import logger
|
||||||
from model.oapen_types import NgramRow, SuggestionRow
|
from model.oapen_types import Ngram, NgramRow, SuggestionRow
|
||||||
from tqdm.auto import tqdm
|
from tqdm.auto import tqdm
|
||||||
|
|
||||||
# initial seed -> get suggestions on everything n^2
|
# initial seed -> get suggestions on everything n^2
|
||||||
|
@ -18,33 +18,41 @@ from tqdm.auto import tqdm
|
||||||
# optimization: only suggest once per pair
|
# optimization: only suggest once per pair
|
||||||
|
|
||||||
|
|
||||||
def get_ngrams_list(arr: List[NgramRow]):
|
def truncate_ngrams_list(item: NgramRow) -> List[NgramRow]:
|
||||||
return [x[0] for x in arr[1][0 : min(len(arr[1]), config.TOP_K_NGRAMS_COUNT)]]
|
ngrams = [
|
||||||
|
Ngram(ngram=x[0], count=0)
|
||||||
|
for x in item.ngrams[0 : min(len(item.ngrams), config.TOP_K_NGRAMS_COUNT)]
|
||||||
|
]
|
||||||
|
return item._replace(ngrams=ngrams)
|
||||||
|
|
||||||
|
|
||||||
def suggestion_task(items, all_items, db_mutex, db):
|
def suggestion_task(items: List[NgramRow], all_items: List[NgramRow], db_mutex, db):
|
||||||
suggestions: List[SuggestionRow] = []
|
suggestions: List[SuggestionRow] = []
|
||||||
for item_a in items:
|
for item_a in items:
|
||||||
handle_a = item_a[0]
|
|
||||||
|
|
||||||
for item_b in all_items:
|
for item_b in all_items:
|
||||||
handle_b = item_b[0]
|
if item_a.handle == item_b.handle:
|
||||||
|
|
||||||
if handle_a == handle_b:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
ngrams_shared = len(list(filter(lambda x: x in item_b[1], item_a[1])))
|
score = len(list(filter(lambda x: x in item_b.ngrams, item_a.ngrams)))
|
||||||
|
|
||||||
if ngrams_shared >= config.SCORE_THRESHOLD:
|
if score >= config.SCORE_THRESHOLD:
|
||||||
suggestions.append((handle_a, handle_a, handle_b, ngrams_shared))
|
suggestions.append(
|
||||||
|
SuggestionRow(
|
||||||
|
handle=item_a.handle,
|
||||||
|
suggestion=item_b.handle,
|
||||||
|
suggestion_name=item_b.name,
|
||||||
|
suggestion_thumbnail=item_b.thumbnail,
|
||||||
|
score=score,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
db_mutex.acquire()
|
if len(suggestions) > 0:
|
||||||
db.add_many_suggestions(suggestions)
|
db_mutex.acquire()
|
||||||
db_mutex.release()
|
db.add_many_suggestions(suggestions)
|
||||||
|
db_mutex.release()
|
||||||
|
|
||||||
return len(items)
|
return len(items)
|
||||||
|
|
||||||
|
|
||||||
def refresh(future, counter, pbar):
|
def refresh(future, counter, pbar):
|
||||||
pbar.update(future.result())
|
pbar.update(future.result())
|
||||||
counter["items_updated"] += future.result()
|
counter["items_updated"] += future.result()
|
||||||
|
@ -78,9 +86,8 @@ def run():
|
||||||
time_start = time.perf_counter()
|
time_start = time.perf_counter()
|
||||||
|
|
||||||
# Get only top k ngrams for all items before processing
|
# Get only top k ngrams for all items before processing
|
||||||
for item in all_items:
|
for i in range(len(all_items)):
|
||||||
ngrams = get_ngrams_list(item)
|
all_items[i] = truncate_ngrams_list(all_items[i])
|
||||||
item = (item[0], ngrams)
|
|
||||||
|
|
||||||
chunks = [
|
chunks = [
|
||||||
all_items[i : i + config.SUGGESTIONS_MAX_ITEMS]
|
all_items[i : i + config.SUGGESTIONS_MAX_ITEMS]
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
|
import random
|
||||||
import signal
|
import signal
|
||||||
import sys
|
import sys
|
||||||
import threading
|
import threading
|
||||||
|
@ -89,7 +90,7 @@ def run():
|
||||||
future = io_executor.submit(harvest_task, url[0], item_queue)
|
future = io_executor.submit(harvest_task, url[0], item_queue)
|
||||||
future.add_done_callback(lambda x: refresh(x, pbar, counter))
|
future.add_done_callback(lambda x: refresh(x, pbar, counter))
|
||||||
producer_futures.append(future)
|
producer_futures.append(future)
|
||||||
time.sleep(config.HARVEST_THREAD_SPAWN_DELAY)
|
time.sleep(random.randint(1, config.HARVEST_THREAD_SPAWN_DELAY))
|
||||||
|
|
||||||
for future in concurrent.futures.as_completed(producer_futures):
|
for future in concurrent.futures.as_completed(producer_futures):
|
||||||
producers_done += 1
|
producers_done += 1
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import itertools
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
from threading import Event
|
from threading import Event
|
||||||
|
|
||||||
|
@ -13,22 +14,20 @@ def db_task(db: OapenDB, db_queue: multiprocessing.Queue, event: Event):
|
||||||
|
|
||||||
def insert_items(entries):
|
def insert_items(entries):
|
||||||
try:
|
try:
|
||||||
|
|
||||||
urls = [e[0] for e in entries]
|
urls = [e[0] for e in entries]
|
||||||
items = []
|
items = list(itertools.chain(*[e[1] for e in entries]))
|
||||||
|
|
||||||
for e in entries:
|
logger.info("(DB) - Inserting {0} item(s).".format(len(items)))
|
||||||
items += e[1]
|
|
||||||
|
|
||||||
logger.debug("(DB) - Inserting {0} item(s).".format(len(items)))
|
|
||||||
|
|
||||||
db.add_many_ngrams(items)
|
db.add_many_ngrams(items)
|
||||||
|
|
||||||
logger.debug("(DB) - Inserted {0} item(s).".format(len(items)))
|
logger.info("(DB) - Inserted {0} item(s).".format(len(items)))
|
||||||
|
|
||||||
for url in urls:
|
for url in urls:
|
||||||
db.update_url(url, True)
|
db.update_url(url, True)
|
||||||
|
|
||||||
return len(items)
|
return
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(e)
|
logger.error(e)
|
||||||
return -1
|
return -1
|
||||||
|
|
|
@ -33,7 +33,6 @@ def harvest_task(url: str, items: multiprocessing.JoinableQueue) -> int or None:
|
||||||
continue
|
continue
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("(IO) (will retry) - " + str(e))
|
logger.error("(IO) (will retry) - " + str(e))
|
||||||
|
|
||||||
time.sleep(RETRY_DELAY)
|
time.sleep(RETRY_DELAY)
|
||||||
|
|
||||||
return ret
|
return ret
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import queue
|
import queue
|
||||||
|
from typing import List
|
||||||
|
|
||||||
import model.ngrams as OapenEngine
|
import model.ngrams as OapenEngine
|
||||||
from logger.base_logger import logger
|
from logger.base_logger import logger
|
||||||
|
from model.oapen_types import OapenItem
|
||||||
|
|
||||||
|
|
||||||
def ngrams_task(
|
def ngrams_task(
|
||||||
|
@ -15,7 +17,8 @@ def ngrams_task(
|
||||||
try:
|
try:
|
||||||
entry = item_queue.get_nowait()
|
entry = item_queue.get_nowait()
|
||||||
|
|
||||||
url, items = entry[0], entry[1]
|
url: str = entry[0]
|
||||||
|
items: List[OapenItem] = entry[1]
|
||||||
|
|
||||||
ngrams = OapenEngine.get_ngrams_for_items(items)
|
ngrams = OapenEngine.get_ngrams_for_items(items)
|
||||||
|
|
||||||
|
|
|
@ -1,28 +1,34 @@
|
||||||
|
import test_ngrams
|
||||||
import test_oapen
|
import test_oapen
|
||||||
import test_stopwords
|
import test_stopwords
|
||||||
import test_ngrams
|
|
||||||
|
|
||||||
def run_test(run_msg, func):
|
def run_test(run_msg, func):
|
||||||
print(run_msg, end = " ")
|
print(run_msg, end=" ")
|
||||||
func()
|
func()
|
||||||
print("OK") # will throw on fail
|
print("OK") # will throw on fail
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
print("Testing connection to OAPEN.")
|
print("Testing connection to OAPEN.")
|
||||||
try:
|
try:
|
||||||
run_test("Attempting to get item [Embodying Contagion]:", test_oapen.test_get_item)
|
run_test(
|
||||||
run_test("Attempting to get null item:", test_oapen.test_get_item_404)
|
"Attempting to get collection limit by id (ea93f8f0-430f-4a03-b7e2-5b06053585b0):",
|
||||||
run_test("Attempting to get collection limit by label [Knowledge Unlatched (KU)]:",
|
test_oapen.test_get_collection_limit,
|
||||||
test_oapen.test_get_collection_limit)
|
)
|
||||||
run_test("Attempting to get null collection:", test_oapen.test_get_collection_404)
|
run_test(
|
||||||
|
"Attempting to get null collection:", test_oapen.test_get_collection_404
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("\nFailed:")
|
print("\nFailed:")
|
||||||
print(e)
|
print(e)
|
||||||
|
|
||||||
print("\nTesting stopwords generation.")
|
print("\nTesting stopwords generation.")
|
||||||
try:
|
try:
|
||||||
run_test("Testing stopwords correctly generated:",
|
run_test(
|
||||||
test_stopwords.test_stopwords_contains_all)
|
"Testing stopwords correctly generated:",
|
||||||
|
test_stopwords.test_stopwords_contains_all,
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Failed:")
|
print("Failed:")
|
||||||
print(e)
|
print(e)
|
||||||
|
@ -32,10 +38,11 @@ def main():
|
||||||
run_test("Testing process_text:", test_ngrams.test_process_text)
|
run_test("Testing process_text:", test_ngrams.test_process_text)
|
||||||
run_test("Testing ngram generation:", test_ngrams.test_generate_ngram)
|
run_test("Testing ngram generation:", test_ngrams.test_generate_ngram)
|
||||||
run_test("Testing similarity score:", test_ngrams.test_similarity_score)
|
run_test("Testing similarity score:", test_ngrams.test_similarity_score)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Failed:")
|
print("Failed:")
|
||||||
print(e)
|
print(e)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
|
@ -1,27 +1,13 @@
|
||||||
from typing import List
|
|
||||||
|
|
||||||
import data.oapen as OapenAPI
|
import data.oapen as OapenAPI
|
||||||
from model.oapen_types import OapenItem
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_item():
|
|
||||||
item = OapenAPI.get_item("20.500.12657/47586")
|
|
||||||
assert isinstance(item, OapenItem)
|
|
||||||
assert item.name == "Embodying Contagion"
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_item_404():
|
|
||||||
item: List[OapenItem] = OapenAPI.get_item("20.400.12657/47581")
|
|
||||||
assert len(item) == 0
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_collection_limit():
|
def test_get_collection_limit():
|
||||||
collection = OapenAPI.get_collection_items_by_label(
|
collection = OapenAPI.get_collection_items_by_id(
|
||||||
"Knowledge Unlatched (KU)", limit=10
|
"ea93f8f0-430f-4a03-b7e2-5b06053585b0", limit=10
|
||||||
)
|
)
|
||||||
assert len(collection) <= 10
|
assert len(collection) <= 10
|
||||||
|
|
||||||
|
|
||||||
def test_get_collection_404():
|
def test_get_collection_404():
|
||||||
collection = OapenAPI.get_collection_items_by_label("hahaha", limit=10)
|
collection = OapenAPI.get_collection_items_by_id("hahaha", limit=10)
|
||||||
assert len(collection) == 0
|
assert collection is None
|
||||||
|
|
Loading…
Reference in New Issue