regluit/core/marc.py

368 lines
12 KiB
Python
Raw Normal View History

"""
This takes a MARCXML filename as an argument and converts it into
MARC records for the unglued edition (in .xml and .mrc formats).
Consider it a catalogolem: http://commons.wikimedia.org/wiki/File:Arcimboldo_Librarian_Stokholm.jpg
Use the MARCXML file for the non-unglued edition from Library of Congress.
"""
import pymarc
2014-10-14 14:08:08 +00:00
import logging
from copy import deepcopy
from datetime import datetime
from StringIO import StringIO
from django.conf import settings
from django.core.files.storage import default_storage
from django.core.urlresolvers import reverse
2014-05-08 14:21:50 +00:00
import regluit.core.cc as cc
from regluit.core import models
2014-10-14 14:08:08 +00:00
def makestub(edition):
    """Build MARC records for *edition* without a source MARCXML file.

    Delegates to makemarc() with no marcfile and returns whatever it returns.
    """
    no_source_file = None
    return makemarc(no_source_file, edition)
# Fixed-length data for the 006 field of an online electronic document.
# MARC 21 defines 006 as exactly 18 characters: position 0 'm' (computer
# file), position 6 'o' (online), position 9 'd' (document); every other
# position is left blank.  Built explicitly so the padding cannot be lost.
_FIELD_006_ONLINE_DOCUMENT = 'm' + ' ' * 5 + 'o' + ' ' * 2 + 'd' + ' ' * 8


def _mark_classification_not_loc(record, tag):
    """Set the indicators of the first *tag* field of *record* to [' ', '4'].

    Does nothing when the record has no such field.  The field is removed and
    re-added with add_ordered_field(), exactly as the callers did inline.
    """
    try:
        field = record.get_fields(tag)[0]
    except IndexError:
        return  # no such field, so there is no indicator to change
    field.indicators = [' ', '4']
    record.remove_field(field)
    record.add_ordered_field(field)


def _write_marc_files(record, marc_record):
    """Write *record* as MARCXML and as binary MARC through default_storage.

    The target names are marc_record.xml_record and marc_record.mrc_record.
    Each file is closed even if serialization or writing fails.
    """
    xmlrecord = pymarc.record_to_xml(record)
    xml_file = default_storage.open(marc_record.xml_record, 'w')
    try:
        xml_file.write(xmlrecord)
    finally:
        xml_file.close()

    mrc_file = default_storage.open(marc_record.mrc_record, 'w')
    try:
        pymarc.MARCWriter(mrc_file).write(record)
    finally:
        mrc_file.close()


def makemarc(marcfile, edition):
    """Create and store MARC records for the ebook version of *edition*.

    marcfile -- MARCXML source handed to pymarc.parse_xml_to_array(); only the
                first record is used.  Pass a false value (see makestub) to
                build the record from the edition's own metadata instead.
    edition  -- the edition to describe; its ebooks, ebook_files, identifiers,
                authors, title, publisher_name, publication_date, unglued and
                work attributes are read.

    When the edition has ebooks, two records are written: one with
    link_target 'UNGLUE' (a single 856 link to the download page) and one
    with link_target 'DIRECT' (an 856 link per ebook).  When it has no ebooks
    but has ebook files, a single record with link_target 'B2U' is written.

    Returns the pk of the 'UNGLUE' / 'B2U' MARCRecord, or None when the
    edition has neither ebooks nor ebook files.
    """
    logger = logging.getLogger(__name__)

    ebf = None
    try:
        rights = edition.ebooks.all()[0].rights
    except IndexError:
        rights = None
        try:
            ebf = edition.ebook_files.all()[0]
        except IndexError:
            # no record if no ebooks
            return None

    logger.info("Making MARC records for edition %s ", edition)

    # save lccn for later (if there is one) before deleting it
    print_lccn = None
    if marcfile:
        record = pymarc.parse_xml_to_array(marcfile)[0]
        for lccn in record.get_fields('010'):
            for validlccn in lccn.get_subfields('a'):
                print_lccn = validlccn
        # collect first, then remove, so the record is not mutated mid-query
        fields_to_delete = []
        for tag in ('001', '003', '005', '006', '007', '010', '040'):
            fields_to_delete += record.get_fields(tag)
        for field in fields_to_delete:
            record.remove_field(field)
    else:
        record = pymarc.Record()

    # create accession number and write 001 field
    # (control field syntax is special)
    link_target = 'B2U' if ebf else 'UNGLUE'
    marc_record, _created = models.MARCRecord.objects.get_or_create(
        edition=edition, link_target=link_target)
    field001 = pymarc.Field(tag='001', data=marc_record.accession)
    record.add_ordered_field(field001)

    # add field indicating record originator
    field003 = pymarc.Field(tag='003', data='UnglueIt')
    record.add_ordered_field(field003)

    # update timestamp of record
    now = datetime.now()
    datestamp = now.strftime('%Y%m%d%H%M%S') + '.0'
    field005 = pymarc.Field(tag='005', data=datestamp)
    record.add_ordered_field(field005)

    # change 006, 007, 008 because this is an online resource
    field006 = pymarc.Field(tag='006', data=_FIELD_006_ONLINE_DOCUMENT)
    record.add_ordered_field(field006)

    field007 = pymarc.Field(tag='007', data='cr')
    record.add_ordered_field(field007)

    try:
        field008 = record.get_fields('008')[0]
    except IndexError:
        # no 008 to adapt: build one from today's date and the edition's
        # publication year, with fill characters for everything unknown
        new_field_value = now.strftime('%y%m%d') + 's'
        # guard against a missing publication date (len(None) would raise)
        pub_date = edition.publication_date or ''
        if len(pub_date) > 3:
            new_field_value += pub_date[0:4]
        else:
            new_field_value += '||||'
        new_field_value += '||||xx |||||o|||||||||||eng||'
    else:
        record.remove_field(field008)
        old_field_value = field008.value()
        # position 23 is overwritten with 'o', matching the generated value
        new_field_value = old_field_value[:23] + 'o' + old_field_value[24:]
    field008 = pymarc.Field(tag='008', data=new_field_value)
    record.add_ordered_field(field008)

    # add ISBN for ebook where applicable; relegate print ISBN to $z
    isbn = ''
    try:
        isbn = edition.identifiers.filter(type='isbn')[0].value
    except IndexError:
        pass

    try:
        field020 = record.get_fields('020')[0]
        print_isbn = field020.get_subfields('a')[0]
        field020.delete_subfield('a')
        if isbn:
            field020.add_subfield('a', isbn)
        field020.add_subfield('z', print_isbn)
    except IndexError:
        # no 020 field, or no $a in it
        print_isbn = None

    # change 050 and 082 indicators because LOC is no longer responsible for these
    _mark_classification_not_loc(record, '050')
    _mark_classification_not_loc(record, '082')

    # author name: only generated when the source record has no 100 field
    if not record.get_fields('100'):
        num_auths = edition.authors.count()
        if num_auths:
            field100 = pymarc.Field(
                tag='100',
                indicators=['1', ' '],
                subfields=[
                    'a', edition.authors.all()[0].last_name_first,
                ]
            )
            record.add_ordered_field(field100)
        if num_auths > 1:
            for auth in edition.authors.all()[1:]:
                field700 = pymarc.Field(
                    tag='700',
                    indicators=['1', ' '],
                    subfields=[
                        'a', auth.last_name_first,
                        'e', 'joint author.',
                    ]
                )
                record.add_ordered_field(field700)

    # add subfield to 245 indicating format
    try:
        field245 = record.get_fields('245')[0]
    except IndexError:
        field245 = pymarc.Field(
            tag='245',
            indicators=['1', '0'],
            subfields=[
                'a', edition.title,
            ]
        )
        record.add_ordered_field(field245)
    # NOTE(review): the medium designator is added as a second $a; MARC
    # usually carries it in $h -- left unchanged, confirm before altering.
    field245.add_subfield('a', '[electronic resource]')

    # publisher, date: only generated when the source record has no 260 field
    if not record.get_fields('260'):
        field260 = pymarc.Field(
            tag='260',
            indicators=[' ', ' '],
            subfields=[
                'b', edition.publisher_name.name,
                'c', unicode(edition.publication_date),
            ]
        )
        record.add_ordered_field(field260)

    # modify 300 field (physical description)
    try:
        field300 = record.get_fields('300')[0]
        subfield_a = field300.get_subfields('a')[0]
    except IndexError:
        pass  # no 300 field (or no $a in it): nothing to rewrite
    else:
        # drop trailing punctuation that introduced the following subfield
        if subfield_a.endswith((' ;', ' :', ' +')):
            subfield_a = subfield_a[:-2]
        new300a = '1 online resource (' + subfield_a + ')'
        if field300.get_subfields('b'):
            new300a += ' :'
        field300.delete_subfield('a')
        field300.add_subfield('a', new300a)
        field300.delete_subfield('c')

    if rights:
        # add 536 field (funding information)
        if edition.unglued:
            funding_info = 'The book is available as a free download thanks to the generous support of interested readers and organizations, who made donations using the crowd-funding website Unglue.it.'
        elif rights in cc.LICENSE_LIST:
            funding_info = 'The book is available as a free download thanks to a Creative Commons license.'
        else:
            funding_info = 'The book is available as a free download because it is in the Public Domain.'
        field536 = pymarc.Field(
            tag='536',
            indicators=[' ', ' '],
            subfields=[
                'a', funding_info,
            ]
        )
        record.add_ordered_field(field536)

        # add 540 field (terms governing use)
        field540 = pymarc.Field(
            tag='540',
            indicators=[' ', ' '],
            subfields=[
                'a', dict(cc.CHOICES)[rights],
                'u', dict(cc.GRANTS)[rights],
            ]
        )
        record.add_ordered_field(field540)

    # add 588 field (source of description) - credit where credit is due
    if print_lccn:
        field588 = pymarc.Field(
            tag='588',
            indicators=[' ', ' '],
            subfields=[
                'a', 'Description based on print version record from the Library of Congress.',
            ]
        )
        record.add_ordered_field(field588)

    # add 776 field (related editions) - preserve pISBN, LCCN, OCLCnum
    #   indicators: 0 8
    #   $i 'Print version: '
    #   $t Title.
    #   $z pISBN, harvested from 020 (was moved from $a to $z);
    #      falls back to the edition's ISBN when there is no print ISBN
    #   $w (DLC) LCCN, harvested from the 010 field before deletion
    #   $w (OCoLC) OCLC number, harvested from the identifiers db
    title = record.get_fields('245')[0].get_subfields('a')[0]
    title = title.split('/')[0]
    try:
        oclcnum = edition.identifiers.filter(type='oclc')[0].value
    except IndexError:
        oclcnum = None

    subfields = ['i', 'Print version: ', 't', title]
    if print_isbn:
        subfields.extend(['z', print_isbn])
    elif isbn:
        subfields.extend(['z', isbn])
    if print_lccn:
        subfields.extend(['w', '(DLC) ' + print_lccn])
    if oclcnum:
        subfields.extend(['w', '(OCoLC) ' + oclcnum])
    field776 = pymarc.Field(
        tag='776',
        indicators=['0', '8'],
        subfields=subfields
    )
    record.add_ordered_field(field776)

    # strip any 9XX fields (they're for local use)
    for i in range(900, 1000):
        for field in record.get_fields(str(i)):
            record.remove_field(field)

    # add 856 fields with links for each available file
    # doing this out of order as it's the only thing that differs
    # between direct-link and via-unglue.it versions
    if not ebf:
        # deepcopy so the two records do not share field objects
        record_direct = deepcopy(record)  # 2 records for unglued stuff
        for format_tuple in settings.FORMATS:
            ebook_format = format_tuple[0]
            for book in edition.ebooks.filter(format=ebook_format):
                field856 = pymarc.Field(
                    tag='856',
                    indicators=['4', '0'],
                    subfields=[
                        '3', ebook_format + ' version',
                        'q', settings.CONTENT_TYPES[ebook_format],
                        'u', book.url,
                    ]
                )
                record_direct.add_ordered_field(field856)

    unglued_url = settings.BASE_URL_SECURE + reverse('download', args=[edition.work.id])
    field856_via = pymarc.Field(
        tag='856',
        indicators=['4', '0'],
        subfields=[
            'u', unglued_url,
        ]
    )
    record.add_ordered_field(field856_via)

    if not ebf:
        # the direct-link record needs its own accession number
        field001 = record_direct.get_fields('001')[0]
        record_direct.remove_field(field001)
        marc_record_direct, _created = models.MARCRecord.objects.get_or_create(
            edition=edition, link_target='DIRECT')
        field001 = pymarc.Field(tag='001', data=marc_record_direct.accession)
        record_direct.add_ordered_field(field001)

        # write the direct-link MARCXML and .mrc records to storage
        _write_marc_files(record_direct, marc_record_direct)

    # write the via-unglue.it MARCXML and .mrc records to storage
    _write_marc_files(record, marc_record)

    return marc_record.pk