424 lines
15 KiB
Python
424 lines
15 KiB
Python
import base64
|
|
from datetime import datetime
|
|
from github import Github
|
|
import lccnorm
|
|
from lxml import etree
|
|
import os
|
|
import re
|
|
import traceback
|
|
import sys
|
|
|
|
import io
|
|
|
|
# Re-wrap the standard streams so all console output is UTF-8 encoded,
# regardless of the platform's default locale encoding (entry titles and
# author names in the CCE data contain non-ASCII characters).
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding = 'utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding = 'utf-8')
|
|
|
|
from model.cce import CCE
|
|
from model.errorCCE import ErrorCCE
|
|
from model.volume import Volume
|
|
|
|
from helpers.errors import DataError
|
|
|
|
class CCEReader():
    """Walks the CCE GitHub repository and imports each year's XML files.

    Year directories live under /xml and are named 19xx; each directory's
    XML files are parsed by CCEFile and persisted through the db manager.
    """

    def __init__(self, manager=None):
        # GitHub client and target repository are configured via environment.
        self.git = Github(os.environ['ACCESS_TOKEN'])
        self.repo = self.git.get_repo(os.environ['CCE_REPO'])
        self.dbManager = manager
        self.cceYears = {}

    def loadYears(self, selectedYear):
        """Register each 19xx year directory (or only selectedYear, if given)."""
        yearPattern = re.compile(r'^19[0-9]{2}$')
        for yearDir in self.repo.get_contents('/xml'):
            if yearPattern.match(yearDir.name) is None:
                continue
            if selectedYear is not None and yearDir.name != selectedYear:
                continue
            self.cceYears[yearDir.name] = {'path': yearDir.path, 'yearFiles': []}

    def loadYearFiles(self, year, loadFromTime):
        """Collect filename/path/sha for each data file in a year directory.

        Skips 'alto' and 'TOC' auxiliary files, and files whose latest commit
        predates loadFromTime (when provided).
        """
        yearInfo = self.cceYears[year]
        fileList = yearInfo['yearFiles']
        for item in self.repo.get_contents(yearInfo['path']):
            if 'alto' in item.name or 'TOC' in item.name:
                continue
            # Most recent commit touching this file determines its freshness.
            latestCommit = self.repo.get_commits(path=item.path)[0]
            commitDate = latestCommit.commit.committer.date
            if loadFromTime is not None and commitDate < loadFromTime:
                continue
            fileList.append({
                'filename': item.name,
                'path': item.path,
                'sha': item.sha
            })

    def getYearFiles(self, loadFromTime):
        """Load the file lists for every registered year."""
        for year in self.cceYears:
            print('Loading files for {}'.format(year))
            self.loadYearFiles(year, loadFromTime)

    def importYearData(self):
        """Import every registered year."""
        for year in self.cceYears:
            self.importYear(year)

    def importYear(self, year):
        """Import every collected file for a single year."""
        for yearFile in self.cceYears[year]['yearFiles']:
            self.importFile(yearFile)

    def importFile(self, yearFile):
        """Parse one XML file and commit the resulting records."""
        print('Importing data from {}'.format(yearFile['filename']))
        cceFile = CCEFile(self.repo, yearFile, self.dbManager.session)
        cceFile.loadFileXML()
        cceFile.readXML()
        self.dbManager.commitChanges()
|
|
|
|
|
|
class CCEFile():
    """Parses one CCE year XML file into CCE/ErrorCCE/Volume records."""

    # Maps each top-level XML tag to the name of the handler method that
    # readXML dispatches to via getattr.
    tagOptions = {
        'header': 'skipElement',
        'page': 'parsePage',
        'copyrightEntry': 'parseEntry',
        'entryGroup': 'parseGroup',
        'crossRef': 'skipElement'
    }

    # Date element tags that may appear on an entry; loadDates collects
    # one list per type.
    dateTypes = ['regDate', 'copyDate', 'pubDate', 'affDate']

    def __init__(self, repo, cceFile, session):
        """Bind a repo, a file-metadata dict and a DB session.

        cceFile is a dict with 'filename', 'path' and 'sha' keys (built by
        CCEReader.loadYearFiles); session is a SQLAlchemy-style session —
        presumably, judging by query()/add() usage elsewhere in the class.
        """
        self.repo = repo
        self.cceFile = cceFile
        self.session = session

        # Populated by loadFileXML.
        self.xmlString = None
        self.root = None
        # Volume record for the file header; populated by loadHeader.
        self.fileHeader = None
        # Position tracking while walking the document.
        self.currentPage = 0
        self.pagePos = 0
|
|
|
|
def loadFileXML(self):
|
|
yearBlob = self.repo.get_git_blob(self.cceFile['sha'])
|
|
self.xmlString = base64.b64decode(yearBlob.content)
|
|
self.root = etree.fromstring(self.xmlString)
|
|
|
|
def readXML(self):
|
|
self.loadHeader()
|
|
|
|
for child in self.root:
|
|
childOp = getattr(self, CCEFile.tagOptions[child.tag])
|
|
self.pagePos += 1
|
|
try:
|
|
childOp(child)
|
|
except DataError as err:
|
|
print('Caught error')
|
|
self.createErrorEntry(
|
|
getattr(err, 'uuid', child.get('id')),
|
|
getattr(err, 'regnum', None),
|
|
getattr(err, 'entry', child),
|
|
getattr(err, 'message', None)
|
|
)
|
|
except Exception as err:
|
|
print('ERROR', err)
|
|
print(traceback.print_exc())
|
|
raise err
|
|
|
|
def skipElement(self, el):
|
|
print('Skipping element {}'.format(el.tag))
|
|
|
|
def parsePage(self, page):
|
|
self.currentPage = page.get('pgnum')
|
|
self.pagePos = 0
|
|
|
|
def parseEntry(self, entry, shared=[]):
|
|
uuid = entry.get('id')
|
|
|
|
if 'regnum' not in entry.attrib:
|
|
print('Entry Missing REGNUM')
|
|
raise DataError(
|
|
'missing_regnum',
|
|
uuid=uuid,
|
|
entry=entry
|
|
)
|
|
|
|
regnums = self.loadRegnums(entry)
|
|
|
|
entryDates = self.loadDates(entry)
|
|
regDates = entryDates['regDate']
|
|
if(len(regnums) != len(regDates)):
|
|
if len(regDates) == 1 and len(regnums) > 0:
|
|
regDates = [regDates[0]] * len(regnums)
|
|
else:
|
|
raise DataError(
|
|
'regnum_date_mismatch',
|
|
uuid=uuid,
|
|
regnum='; '.join(regnums),
|
|
entry=entry
|
|
)
|
|
|
|
regs = self.createRegistrations(regnums, regDates)
|
|
existingRec = self.matchUUID(uuid)
|
|
if existingRec:
|
|
self.updateEntry(existingRec, entryDates, entry, shared, regs)
|
|
else:
|
|
self.createEntry(uuid, entryDates, entry, shared, regs)
|
|
|
|
    def matchUUID(self, uuid):
        """Return the existing CCE record with this uuid, or None."""
        return self.session.query(CCE).filter(CCE.uuid == uuid).one_or_none()
|
|
|
|
    def createEntry(self, uuid, dates, entry, shared, registrations):
        """Build a new CCE record from an entry element and add it to the session.

        dates is the dict produced by loadDates (lists of (parsed, raw text)
        tuples per date type); shared holds group-level elements whose titles
        and authors also apply to this entry; registrations is the list of
        dicts from createRegistrations.
        """
        titles = self.createTitleList(entry, shared)
        authors = self.createAuthorList(entry, shared)
        copies = CCEFile.fetchText(entry, 'copies')
        description = CCEFile.fetchText(entry, 'desc')
        # Presence of any newMatterClaimed child marks the entry.
        newMatter = len(entry.findall('newMatterClaimed')) > 0

        # (publisher name, claimant attribute) pairs from anywhere in the entry.
        publishers = [
            (c.text, c.get('claimant', None))
            for c in entry.findall('.//pubName')
        ]

        # Normalize LCCNs to canonical form.
        lccn = [ lccnorm.normalize(l.text) for l in entry.findall('lccn') ]

        cceRec = CCE(
            uuid=uuid,
            page=self.currentPage,
            page_position=self.pagePos,
            title=titles,
            copies=copies,
            description=description,
            new_matter=newMatter,
            pub_date=CCEFile.fetchDateValue(dates['pubDate'], text=False),
            pub_date_text=CCEFile.fetchDateValue(dates['pubDate'], text=True),
            aff_date=CCEFile.fetchDateValue(dates['affDate'], text=False),
            aff_date_text=CCEFile.fetchDateValue(dates['affDate'], text=True),
            copy_date=CCEFile.fetchDateValue(dates['copyDate'], text=False),
            copy_date_text=CCEFile.fetchDateValue(dates['copyDate'], text=True)
        )
        # Link authors/publishers/lccn/registrations and the file's Volume.
        cceRec.addRelationships(
            self.fileHeader,
            entry,
            authors=authors,
            publishers=publishers,
            lccn=lccn,
            registrations=registrations
        )
        self.session.add(cceRec)
        print('INSERT', cceRec)
|
|
|
|
    def updateEntry(self, rec, dates, entry, shared, registrations):
        """Refresh an existing CCE record with the entry's current data.

        Mirrors createEntry field-for-field but mutates rec in place and
        delegates relationship reconciliation to rec.updateRelationships.
        """
        rec.title = self.createTitleList(entry, shared)
        rec.copies = CCEFile.fetchText(entry, 'copies')
        rec.description = CCEFile.fetchText(entry, 'desc')
        rec.new_matter = len(entry.findall('newMatterClaimed')) > 0
        rec.page = self.currentPage
        rec.page_position = self.pagePos

        rec.pub_date = CCEFile.fetchDateValue(dates['pubDate'], text=False)
        rec.pub_date_text = CCEFile.fetchDateValue(dates['pubDate'], text=True)
        rec.aff_date = CCEFile.fetchDateValue(dates['affDate'], text=False)
        rec.aff_date_text = CCEFile.fetchDateValue(dates['affDate'], text=True)
        rec.copy_date = CCEFile.fetchDateValue(dates['copyDate'], text=False)
        rec.copy_date_text = CCEFile.fetchDateValue(dates['copyDate'], text=True)

        authors = self.createAuthorList(entry, shared)
        # (publisher name, claimant attribute) pairs, as in createEntry.
        publishers = [
            (c.text, c.get('claimant', None))
            for c in entry.findall('.//pubName')
        ]
        lccn = [ lccnorm.normalize(l.text) for l in entry.findall('lccn') ]

        rec.updateRelationships(
            entry,
            authors=authors,
            publishers=publishers,
            lccn=lccn,
            registrations=registrations
        )
        print('UPDATE', rec)
|
|
|
|
def createRegistrations(self, regnums, regdates):
|
|
registrations = []
|
|
for x in range(len(regnums)):
|
|
try:
|
|
regCat = re.match(r'([A-Z]+)', regnums[x][0]).group(1)
|
|
except AttributeError:
|
|
regCat ='Unknown'
|
|
registrations.append({
|
|
'regnum': regnums[x],
|
|
'category': regCat,
|
|
'regDate': regdates[x][0],
|
|
'regDateText': regdates[x][1]
|
|
})
|
|
return registrations
|
|
|
|
def loadRegnums(self, entry):
|
|
rawRegnums = entry.get('regnum').strip()
|
|
regnums = rawRegnums.split(' ')
|
|
regnums.extend(self.loadAddtlEntries(entry))
|
|
outNums = []
|
|
for num in regnums:
|
|
parsedNum = self.parseRegNum(num)
|
|
if type(parsedNum) is str: outNums.append(parsedNum)
|
|
else: outNums.extend(parsedNum)
|
|
return outNums
|
|
|
|
def loadAddtlEntries(self, entry):
|
|
moreRegnums = []
|
|
for addtl in entry.findall('.//additionalEntry'):
|
|
try:
|
|
addtlRegnum = addtl.get('regnum').strip()
|
|
moreRegnums.extend(addtlRegnum.split(' '))
|
|
except AttributeError:
|
|
continue
|
|
return moreRegnums
|
|
|
|
def parseRegNum(self, num):
|
|
try:
|
|
if (
|
|
re.search(r'[0-9]+\-[A-Z0-9]+', num)
|
|
and num[2] != '0'
|
|
and num[:2] != 'B5'
|
|
):
|
|
regnumRange = num.split('-')
|
|
regRangePrefix = re.match(r'([A-Z]+)', regnumRange[0]).group(1)
|
|
regRangeStart = int(regnumRange[0].replace(regRangePrefix, ''))
|
|
regRangeEnd = int(regnumRange[1].replace(regRangePrefix, ''))
|
|
if regRangeEnd - regRangeStart < 1000:
|
|
return [
|
|
'{}{}'.format(regRangePrefix, str(i))
|
|
for i in range(regRangeStart, regRangeEnd)
|
|
]
|
|
except (ValueError, AttributeError) as err:
|
|
print('RANGE ERROR', self.currentPage, self.pagePos)
|
|
raise DataError('regnum_range_parsing_error')
|
|
|
|
return num
|
|
|
|
def loadDates(self, entry):
|
|
dates = {}
|
|
for dType in CCEFile.dateTypes:
|
|
dates[dType] = [
|
|
(CCEFile.parseDate(d.get('date', None), '%Y-%m-%d'), d.text)
|
|
for d in entry.findall('.//{}'.format(dType))
|
|
if not ('ignore' in d.attrib and d.get('ignore') == 'yes')
|
|
]
|
|
if len(dates['regDate']) < 1:
|
|
if len(dates['copyDate']) > 0:
|
|
dates['regDate'].append(dates['copyDate'][0])
|
|
elif len(dates['pubDate']) > 0:
|
|
dates['regDate'].append(dates['pubDate'][0])
|
|
|
|
return dates
|
|
|
|
@staticmethod
|
|
def parseDate(date, dateFormat):
|
|
if date is None or dateFormat == '': return None
|
|
|
|
try:
|
|
outDate = datetime.strptime(date, dateFormat)
|
|
return outDate.strftime('%Y-%m-%d')
|
|
except ValueError:
|
|
return CCEFile.parseDate(date, dateFormat[:-3])
|
|
|
|
def createTitleList(self, entry, shared):
|
|
return '; '.join([
|
|
t.text
|
|
for t in entry.findall('.//title') + [
|
|
t for t in shared if t.tag == 'title'
|
|
]
|
|
if t.text is not None
|
|
])
|
|
|
|
    def createErrorEntry(self, uuid, regnum, entry, reason):
        """Persist a failed entry as an ErrorCCE row, keeping its raw XML."""
        errorCCE = ErrorCCE(
            uuid=uuid,
            regnum=regnum,
            page=self.currentPage,
            page_position=self.pagePos,
            reason=reason
        )
        # Associate with the current file's Volume and retain the source
        # element so the failure can be reviewed later.
        errorCCE.volume = self.fileHeader
        errorCCE.addXML(entry)
        self.session.add(errorCCE)
|
|
|
|
def parseGroup(self, group):
|
|
entries = []
|
|
shared = []
|
|
for field in group:
|
|
if field.tag == 'page': self.parsePage(field)
|
|
elif field.tag == 'copyrightEntry': entries.append(field)
|
|
else: shared.append(field)
|
|
|
|
for entry in entries:
|
|
try:
|
|
self.parseEntry(entry, shared)
|
|
except DataError as err:
|
|
err.uuid = entry.get('id')
|
|
raise err
|
|
|
|
def createAuthorList(self, rec, shared=[]):
|
|
authors = [
|
|
(a.text, False) for a in rec.findall('.//authorName')
|
|
if len(list(a.itersiblings(tag='role', preceding=True))) > 0
|
|
]
|
|
if len(authors) < 1:
|
|
try:
|
|
authors.append((rec.findall('.//authorName')[0].text, False))
|
|
except IndexError:
|
|
pass
|
|
|
|
authors.extend([
|
|
(a.text, False) for a in shared if a.tag == 'authorName'
|
|
])
|
|
|
|
try:
|
|
authors[0] = (authors[0][0], True)
|
|
except IndexError:
|
|
pass
|
|
|
|
return authors
|
|
|
|
    def loadHeader(self):
        """Parse the file's header element into a Volume record.

        Stores the new Volume on self.fileHeader (used later when linking
        entries and errors) and adds it to the session.
        """
        header = self.root.find('.//header')

        # Division numbering appears either as a start/end range element
        # ('numbers') or as a single 'number' used for both bounds.
        startNum = endNum = None
        if header.find('cite/division/numbers') is not None:
            startNum = header.find('cite/division/numbers').get('start', None)
            endNum = header.find('cite/division/numbers').get('end', None)
        elif header.find('cite/division/number') is not None:
            volNum = CCEFile.fetchText(header, 'cite/division/number')
            startNum = volNum
            endNum= volNum

        headerDict = {
            'source': header.find('source').get('url', None),
            'status': CCEFile.fetchText(header, 'status'),
            'series': header.find('cite/series').get('label', None),
            'volume': CCEFile.fetchText(header, 'cite/volume'),
            'year': CCEFile.fetchText(header, 'cite/year'),
            'part': CCEFile.fetchText(header, 'cite/division/part'),
            'group': CCEFile.fetchText(header, 'cite/division/group'),
            'material': CCEFile.fetchText(header, 'cite/division/material'),
            'numbers': {
                'start': startNum,
                'end': endNum
            }
        }

        self.fileHeader = Volume(
            source=headerDict['source'],
            status=headerDict['status'],
            series=headerDict['series'],
            volume=headerDict['volume'],
            year=headerDict['year'],
            part=headerDict['part'],
            group=headerDict['group'],
            material=headerDict['material'],
            start_number=headerDict['numbers']['start'],
            end_number=headerDict['numbers']['end']
        )
        self.session.add(self.fileHeader)
|
|
|
|
@staticmethod
|
|
def fetchText(parent, tag):
|
|
element = parent.find(tag)
|
|
return element.text if element is not None else None
|
|
|
|
@staticmethod
|
|
def fetchDateValue(date, text=False):
|
|
x = 1 if text else 0
|
|
return date[0][x] if len(date) > 0 else None
|