313 lines
11 KiB
Python
313 lines
11 KiB
Python
"""
This takes a MARCXML filename as an argument and converts it into
MARC records for the unglued edition (in .xml and .mrc formats).
Consider it a catalogolem: http://commons.wikimedia.org/wiki/File:Arcimboldo_Librarian_Stokholm.jpg
Use the MARCXML file for the non-unglued edition from Library of Congress.
"""
|
|
|
|
import logging
|
|
import pymarc
|
|
from copy import deepcopy
|
|
from datetime import datetime
|
|
from StringIO import StringIO
|
|
|
|
from django.conf import settings
|
|
from django.core.files.storage import default_storage
|
|
from django.core.urlresolvers import reverse
|
|
|
|
from regluit.core import models
|
|
|
|
# Rights values that are described as Creative Commons in the 536 note.
_CC_RIGHTS = (
    'CC BY', 'CC BY-NC-SA', 'CC BY-NC-ND', 'CC BY-NC', 'CC BY-ND', 'CC BY-SA',
)


def _accession_number(marc_id):
    """Return the 001 accession string: 'ung' + the id zero-padded to 9 digits."""
    return 'ung' + str(marc_id).zfill(9)


def _first_identifier(edition, id_type, default):
    """Return the value of the edition's first identifier of id_type, or default."""
    try:
        return edition.identifiers.filter(type=id_type)[0].value
    except IndexError:
        return default


def _reset_classification_indicators(record, tag):
    """Set the indicators of the first `tag` field to [' ', '4'], if one exists."""
    matches = record.get_fields(tag)
    if not matches:
        # no such field, so there is no indicator to change
        return
    field = matches[0]
    field.indicators = [' ', '4']
    # remove and re-add, as the original code did, so the field is
    # re-inserted through add_ordered_field
    record.remove_field(field)
    record.add_ordered_field(field)


def _record_to_mrc(record):
    """Serialize a pymarc record through MARCWriter and return the result."""
    buf = StringIO()
    pymarc.MARCWriter(buf).write(record)
    return buf.getvalue()


def _write_to_storage(filename, data):
    """Write data to filename in default_storage, always closing the file."""
    storage_file = default_storage.open(filename, 'w')
    try:
        storage_file.write(data)
    finally:
        storage_file.close()


def makemarc(marcfile, edition):
    """
    Build and store MARC records for the unglued ebook version of an edition.

    Parses the first record in ``marcfile`` (MARCXML for the print edition),
    rewrites it to describe the online edition, and produces two variants:
    one whose 856 fields link directly to each ebook file ('DIRECT') and one
    with a single 856 linking to the unglue.it download page ('UNGLUE').
    Each variant is written to default_storage as .xml and .mrc, and the
    resulting URLs are saved on the matching models.MARCRecord rows.

    Arguments:
        marcfile -- MARCXML source, passed straight to
                    pymarc.parse_xml_to_array
        edition  -- the edition being described; its ebooks, identifiers,
                    unglued flag and work are read

    Returns None.

    Raises IndexError if the edition has no ebooks or the source record
    lacks an 010 $a, 008, 245 $a or 300 $a; raises KeyError if the ebook
    rights are missing from settings.CHOICES / settings.GRANTS or an ebook
    format is missing from settings.CONTENT_TYPES.

    fyi if we're going to suck down LOC records directly:
        parse_xml_to_array takes a file, so we need to faff about with
        file writes
        would be nice to have a suitable z39.50
        can use LCCN to grab record with urllib, but file writes are
        inconsistent
    """
    logger = logging.getLogger(__name__)
    rights = edition.ebooks.all()[0].rights
    logger.info("Making MARC records for edition %s and license %s", edition, rights)

    directory = 'marc' if '/unglue.it' in settings.BASE_URL else 'marc_test'

    record = pymarc.parse_xml_to_array(marcfile)[0]

    # save this for later before deleting it
    print_lccn = record.get_fields('010')[0].get_subfields('a')[0]

    # drop the control/number fields that get regenerated (or no longer apply)
    for tag in ('001', '003', '005', '006', '007', '010', '040'):
        for field in record.get_fields(tag):
            record.remove_field(field)

    # create accession number and write 001 field
    # (control field syntax is special)
    marc_record = models.MARCRecord.objects.get_or_create(
        edition=edition, link_target='DIRECT')[0]
    accession = _accession_number(marc_record.id)
    record.add_ordered_field(pymarc.Field(tag='001', data=accession))

    # add field indicating record originator
    record.add_ordered_field(pymarc.Field(tag='003', data='UnglueIt'))

    # update timestamp of record
    datestamp = datetime.now().strftime('%Y%m%d%H%M%S') + '.0'
    record.add_ordered_field(pymarc.Field(tag='005', data=datestamp))

    # change 006, 007, 008 because this is an online resource
    # NOTE(review): MARC 006 is normally a fixed-length field; the literal
    # below may have lost padding spaces somewhere upstream -- confirm.
    record.add_ordered_field(pymarc.Field(tag='006', data='m o d '))
    record.add_ordered_field(pymarc.Field(tag='007', data='cr'))

    old_008 = record.get_fields('008')[0]
    record.remove_field(old_008)
    old_008_value = old_008.value()
    # overwrite character position 23 with 'o'
    new_008_value = old_008_value[:23] + 'o' + old_008_value[24:]
    record.add_ordered_field(pymarc.Field(tag='008', data=new_008_value))

    # add ISBN for ebook where applicable; relegate print ISBN to $z
    isbn = _first_identifier(edition, 'isbn', '')
    try:
        field020 = record.get_fields('020')[0]
        print_isbn = field020.get_subfields('a')[0]
    except IndexError:
        # no 020 field, or no $a in it: nothing to relegate
        print_isbn = None
    else:
        field020.delete_subfield('a')
        if isbn:
            field020.add_subfield('a', isbn)
        field020.add_subfield('z', print_isbn)

    # change 050 and 082 indicators because LOC is no longer responsible
    # for these
    _reset_classification_indicators(record, '050')
    _reset_classification_indicators(record, '082')

    # add subfield to 245 indicating format
    field245 = record.get_fields('245')[0]
    field245.add_subfield('h', '[electronic resource]')

    # modify 300 field (physical description)
    field300 = record.get_fields('300')[0]
    extent = field300.get_subfields('a')[0]
    # strip trailing punctuation that introduced the next subfield
    if extent.endswith((' ;', ' :', ' +')):
        extent = extent[:-2]
    new300a = '1 online resource (' + extent + ')'
    if field300.get_subfields('b'):
        new300a += ' :'
    # NOTE(review): add_subfield presumably appends, which would leave $a
    # after any existing $b -- confirm against the pymarc version in use.
    field300.delete_subfield('a')
    field300.add_subfield('a', new300a)
    field300.delete_subfield('c')

    # add 536 field (funding information)
    if edition.unglued:
        funding_info = 'The book is available as a free download thanks to the generous support of interested readers and organizations, who made donations using the crowd-funding website Unglue.it.'
    elif rights in _CC_RIGHTS:
        funding_info = 'The book is available as a free download thanks to a Creative Commons license.'
    else:
        funding_info = 'The book is available as a free download because it is in the Public Domain.'
    record.add_ordered_field(pymarc.Field(
        tag='536',
        indicators=[' ', ' '],
        subfields=['a', funding_info],
    ))

    # add 540 field (terms governing use)
    record.add_ordered_field(pymarc.Field(
        tag='540',
        indicators=[' ', ' '],
        subfields=[
            'a', dict(settings.CHOICES)[rights],
            'u', dict(settings.GRANTS)[rights],
        ],
    ))

    # add 588 field (source of description) - credit where credit is due
    record.add_ordered_field(pymarc.Field(
        tag='588',
        indicators=[' ', ' '],
        subfields=[
            'a', 'Description based on print version record from the Library of Congress.',
        ],
    ))

    # add 776 field (related editions) - preserve pISBN, LCCN, OCLCnum
    #   indicators: 0 8
    #   $i 'Print version: '
    #   $t Title. <--note space
    #   $d is optional
    #   $z pISBN, harvested from 020 (was moved from $a to $z)
    #   $w (DLC) LCCN, harvested from 010 field before deletion
    #   $w (OCoLC) OCLCnum, harvested from identifiers db
    title = record.get_fields('245')[0].get_subfields('a')[0]
    title = title.split('/')[0]
    # take the identifier's value, the same way the ISBN is looked up; the
    # bare queryset could not be concatenated into the $w string below
    oclcnum = _first_identifier(edition, 'oclc', None)

    subfields_776 = ['i', 'Print version: ', 't', title]
    if print_isbn:
        subfields_776.extend(['z', print_isbn])
    subfields_776.extend(['w', '(DLC) ' + print_lccn])
    if oclcnum:
        subfields_776.extend(['w', '(OCoLC) ' + oclcnum])
    record.add_ordered_field(pymarc.Field(
        tag='776',
        indicators=['0', '8'],
        subfields=subfields_776,
    ))

    # strip any 9XX fields (they're for local use)
    for tag_number in range(900, 1000):
        for field in record.get_fields(str(tag_number)):
            record.remove_field(field)

    # add 856 fields with links for each available file
    # doing this out of order as it's the only thing that differs
    # between direct-link and via-unglue.it versions
    # deepcopy() so the two variants do not share field objects
    record_via_unglueit = deepcopy(record)

    content_types = settings.CONTENT_TYPES
    for format_tuple in settings.FORMATS:
        ebook_format = format_tuple[0]
        for book in edition.ebooks.filter(format=ebook_format):
            record.add_ordered_field(pymarc.Field(
                tag='856',
                indicators=['4', '0'],
                subfields=[
                    '3', ebook_format + ' version',
                    'q', content_types[ebook_format],
                    'u', book.url,
                ],
            ))

    unglued_url = settings.BASE_URL_SECURE + reverse('download', args=[edition.work.id])
    record_via_unglueit.add_ordered_field(pymarc.Field(
        tag='856',
        indicators=['4', '0'],
        subfields=['u', unglued_url],
    ))

    # this via_unglueit record needs its own accession number
    record_via_unglueit.remove_field(record_via_unglueit.get_fields('001')[0])
    marc_record_via = models.MARCRecord.objects.get_or_create(
        edition=edition, link_target='UNGLUE')[0]
    accession_via = _accession_number(marc_record_via.id)
    record_via_unglueit.add_ordered_field(
        pymarc.Field(tag='001', data=accession_via))

    # write the unglued MARCxml records
    xml_filename = directory + '/' + accession + '_unglued.xml'
    _write_to_storage(xml_filename, pymarc.record_to_xml(record))
    logger.info("MARCXML record for edition %s written to S3", edition)

    xml_filename_via = directory + '/' + accession_via + '_via_unglueit.xml'
    _write_to_storage(xml_filename_via, pymarc.record_to_xml(record_via_unglueit))
    logger.info("MARCXML record for edition %s via unglue.it written to S3", edition)

    # write the unglued .mrc records, then save to s3
    mrc_filename = directory + '/' + accession + '_unglued.mrc'
    _write_to_storage(mrc_filename, _record_to_mrc(record))
    logger.info(".mrc record for edition %s written to S3", edition)

    mrc_filename_via = directory + '/' + accession_via + '_via_unglueit.mrc'
    _write_to_storage(mrc_filename_via, _record_to_mrc(record_via_unglueit))
    logger.info(".mrc record for edition %s via unglue.it written to S3", edition)

    # record where the files ended up
    marc_record.xml_record = default_storage.url(xml_filename)
    marc_record.mrc_record = default_storage.url(mrc_filename)
    marc_record.link_target = 'DIRECT'
    marc_record.save()

    marc_record_via.xml_record = default_storage.url(xml_filename_via)
    marc_record_via.mrc_record = default_storage.url(mrc_filename_via)
    marc_record_via.link_target = 'UNGLUE'
    marc_record_via.save()

    logger.info(
        "MARCRecord instances complete for edition %s with accession numbers %s and %s",
        edition, accession, accession_via)
|