Select most complete date
In `copyrightEntry` XML entries there are frequently repeated date elements of the same type. Their values can differ and be more or less complete. The most common example is an entry with two `pub_date` values, such as `1932` and `1932-06-15`. Previously this process simply took the first date. This change updates the date parser to sort all dates of the same type by length, so that the most complete date is parsed first. If it cannot be parsed, the other, less complete, dates are used. This was prompted by the presence of false January 1st dates and also by the discovery that publication dates are often used as replacement registration dates in renewals.
select-best-date
parent
910d514e08
commit
5e02c87657
19
builder.py
19
builder.py
|
@ -432,13 +432,18 @@ class CCEFile():
|
|||
|
||||
@staticmethod
def fetchDateValue(date, text=False):
    """Return the value of the most complete usable date among candidates.

    Candidates are tried longest machine-readable string first, so that
    e.g. ``1932-06-15`` is preferred over ``1932``. If a candidate yields
    no value, the next, less complete, one is tried.

    Args:
        date: List of indexable pairs; index 0 holds the machine-readable
            date string (its length is the sort key) and index 1 the
            display text. May be empty.
        text: Passed through to ``CCEFile.dateReader``; ``False`` requests
            a parsed value, anything else requests the display text.

    Returns:
        The first non-``None`` result from ``CCEFile.dateReader``, or
        ``None`` when the list is empty or no candidate yields a value.
    """
    # sorted() leaves the caller's list untouched. The sort is stable, so
    # equally long candidates keep their original relative order.
    candidates = sorted(date, key=lambda x: len(x[0]), reverse=True)

    for candidate in candidates:
        # Positional call keeps the (date, text) call shape unchanged.
        value = CCEFile.dateReader(candidate, text)
        if value is not None:
            return value

    return None
|
||||
|
||||
@staticmethod
def dateReader(date, text=False):
    """Read a single date entry as a parsed value or as its display text.

    Args:
        date: Indexable pair; index 0 is the machine-readable date string
            handed to ``parser.parse`` and index 1 is the display text.
        text: When exactly ``False`` the parsed value is returned; for any
            other value the display text at index 1 is returned as-is.

    Returns:
        The result of ``parser.parse(date[0])``, ``None`` if parsing
        fails, or ``date[1]`` when text output was requested.
    """
    if text is not False:
        return date[1]

    try:
        return parser.parse(date[0])
    except (ValueError, TypeError, OverflowError):
        # NOTE(review): `parser` is presumably dateutil.parser, which is
        # documented to raise OverflowError for out-of-range values; an
        # unparseable date is reported as None rather than aborting.
        return None
|
||||
|
|
|
@ -0,0 +1,46 @@
|
|||
from datetime import datetime
|
||||
|
||||
from builder import CCEFile
|
||||
|
||||
|
||||
class TestEntryBuilder(object):
    """Unit tests for the date helpers exposed by ``CCEFile``."""

    def test_dateReader(self):
        # A complete ISO date string is parsed into a datetime.
        outDate = CCEFile.dateReader(('2019-01-01', 'Jan. 1, 2019'))
        assert outDate == datetime(2019, 1, 1)

    def test_dateReader_text(self):
        # With text=True the display text is returned untouched.
        outDate = CCEFile.dateReader(('2019-01-01', 'Jan. 1, 2019'), text=True)
        assert outDate == 'Jan. 1, 2019'

    def test_dateReader_error(self, mocker):
        # A parser failure is reported as None instead of propagating.
        mockParser = mocker.patch('builder.parser.parse')
        mockParser.side_effect = ValueError

        # Pass an (iso, text) pair like real callers do; a bare string
        # would make date[0] a single character.
        outDate = CCEFile.dateReader(('2020', '2020'))

        mockParser.assert_called_once_with('2020')
        assert outDate is None

    def test_fetchDateValue(self):
        # No candidates means no date.
        outDate = CCEFile.fetchDateValue([])
        assert outDate is None

    def test_fetchDateValue_single(self, mocker):
        # A single candidate is handed straight to dateReader.
        mockReader = mocker.patch('builder.CCEFile.dateReader')
        mockReader.return_value = True

        mockDate = ('2019-01-01', 'Jan. 1')
        outDate = CCEFile.fetchDateValue([mockDate])

        mockReader.assert_called_once_with(mockDate, False)
        assert outDate is True

    def test_fetchDateValue_multiple(self, mocker):
        # The longest (most complete) date string is read first.
        mockReader = mocker.patch('builder.CCEFile.dateReader')
        mockReader.return_value = True

        firstDate = ('2019', '2019')
        secondDate = ('2019-01', 'January')
        thirdDate = ('2019-01-01', 'Jan. 1')
        outDate = CCEFile.fetchDateValue([firstDate, secondDate, thirdDate])

        mockReader.assert_called_once_with(thirdDate, False)
        assert outDate is True
|
Loading…
Reference in New Issue