Select most complete date

In `copyrightEntry` XML entries there are frequently repeated date elements that represent the same date. Their values can differ and be more or less complete. The most common example of this is an entry having two `pub_date` values, such as `1932` and `1932-06-15`.

Previously this process simply took the first date. This updates the date parser to sort all dates of the same type by length, so that the most complete date is parsed first. If it cannot be parsed, the other, less complete, dates are used.

This was prompted by the presence of false January 1st dates and also the discovery that publication dates are often used as replacement registration dates in renewals.
select-best-date
Mike Benowitz 2019-08-21 11:29:22 -04:00
parent 910d514e08
commit 5e02c87657
2 changed files with 58 additions and 7 deletions

View File

@ -432,13 +432,18 @@ class CCEFile():
@staticmethod
def fetchDateValue(date, text=False):
    """Return the value of the most complete usable date among candidates.

    Candidates are tried from the longest raw value to the shortest, so a
    full date such as ``1932-06-15`` is preferred over a bare ``1932``.
    If the preferred candidate yields nothing (``dateReader`` returns
    ``None``), the next, less complete, candidate is tried.

    Args:
        date: A list of date entries. Each entry is indexable, with the raw
            date value at index 0 and the display text at index 1. May be
            empty or ``None``.
        text: Passed through to ``CCEFile.dateReader``; when not ``False``
            the display text is returned instead of a parsed date.

    Returns:
        The first non-``None`` result from ``CCEFile.dateReader``, or
        ``None`` when there are no candidates or none produce a value.
    """
    # sorted() instead of list.sort() so the caller's list is not reordered
    # as a side effect. A missing raw value (None) is ranked as length 0
    # rather than raising TypeError inside the sort key.
    candidates = sorted(
        date or [],
        key=lambda entry: len(entry[0] or ''),
        reverse=True
    )

    # Fall back to progressively less complete dates until one is readable.
    for candidate in candidates:
        value = CCEFile.dateReader(candidate, text)
        if value is not None:
            return value

    return None
@staticmethod
def dateReader(date, text=False):
    """Read a single date entry as either a parsed date or its text.

    Args:
        date: An indexable entry; index 0 is handed to ``parser.parse`` and
            index 1 is returned verbatim in text mode.
        text: When exactly ``False`` (the default) the entry is parsed;
            any other value selects text mode.

    Returns:
        In text mode, ``date[1]``. Otherwise the result of
        ``parser.parse(date[0])``, or ``None`` if parsing raises
        ``ValueError`` or ``TypeError``.
    """
    # Text mode: no parsing needed, hand back the stored display string.
    if text is not False:
        return date[1]

    try:
        parsed = parser.parse(date[0])
    except (ValueError, TypeError):
        # An unparseable or non-string value is reported as "no date".
        parsed = None
    return parsed

View File

@ -0,0 +1,46 @@
from datetime import datetime
from builder import CCEFile
class TestEntryBuilder(object):
    """Unit tests for ``CCEFile.dateReader`` and ``CCEFile.fetchDateValue``."""

    def test_dateReader(self):
        """A (value, text) pair is parsed into a datetime by default."""
        outDate = CCEFile.dateReader(('2019-01-01', 'Jan. 1, 2019'))
        assert outDate == datetime(2019, 1, 1)

    def test_dateReader_text(self):
        """With text=True the display text is returned unparsed."""
        outDate = CCEFile.dateReader(('2019-01-01', 'Jan. 1, 2019'), text=True)
        assert outDate == 'Jan. 1, 2019'

    def test_dateReader_error(self, mocker):
        """A parser failure is swallowed and reported as None."""
        mockParser = mocker.patch('builder.parser.parse')
        mockParser.side_effect = ValueError
        # Pass a (value, text) pair like every other caller; a bare string
        # would be indexed per character and only "work" because of the mock.
        outDate = CCEFile.dateReader(('2020', '2020'))
        mockParser.assert_called_once_with('2020')
        assert outDate is None

    def test_fetchDateValue(self):
        """An empty candidate list yields None."""
        outDate = CCEFile.fetchDateValue([])
        assert outDate is None

    def test_fetchDateValue_single(self, mocker):
        """A single candidate is handed straight to dateReader."""
        mockReader = mocker.patch('builder.CCEFile.dateReader')
        mockReader.return_value = True
        mockDate = ('2019-01-01', 'Jan. 1')
        outDate = CCEFile.fetchDateValue([mockDate])
        mockReader.assert_called_once_with(mockDate, False)
        # The reader's result must be passed through, not merely be truthy.
        assert outDate is True

    def test_fetchDateValue_multiple(self, mocker):
        """The candidate with the longest raw value is the one that is read."""
        mockReader = mocker.patch('builder.CCEFile.dateReader')
        mockReader.return_value = True
        firstDate = ('2019', '2019')
        secondDate = ('2019-01', 'January')
        thirdDate = ('2019-01-01', 'Jan. 1')
        outDate = CCEFile.fetchDateValue([firstDate, secondDate, thirdDate])
        mockReader.assert_called_once_with(thirdDate, False)
        assert outDate is True