2013-07-17 14:34:01 +00:00
"""
This takes a MARCXML filename as an argument and converts it into
MARC records for the unglued edition (in .xml and .mrc formats).
2013-07-17 17:03:35 +00:00
Consider it a catalogolem: http://commons.wikimedia.org/wiki/File:Arcimboldo_Librarian_Stokholm.jpg
2013-07-17 14:34:01 +00:00
Use the MARCXML file for the non-unglued edition from Library of Congress.
"""
import pymarc
2014-10-14 14:08:08 +00:00
import logging
2013-07-23 13:41:55 +00:00
from copy import deepcopy
2013-07-17 14:34:01 +00:00
from datetime import datetime
from StringIO import StringIO
2013-07-17 17:03:35 +00:00
from django . conf import settings
2013-07-17 14:34:01 +00:00
from django . core . files . storage import default_storage
2013-07-23 13:41:55 +00:00
from django . core . urlresolvers import reverse
2013-07-17 14:34:01 +00:00
2014-05-08 14:21:50 +00:00
import regluit . core . cc as cc
2013-07-17 14:34:01 +00:00
from regluit . core import models
2014-10-14 14:08:08 +00:00
def makestub(edition):
    """Build MARC records for ``edition`` without a source MARCXML file.

    Delegates to ``makemarc`` with no ``marcfile``, so the record is created
    from scratch instead of being parsed from an existing MARCXML file.
    Returns whatever ``makemarc`` returns.
    """
    no_marcfile = None
    return makemarc(no_marcfile, edition)
2013-07-26 23:52:15 +00:00
def _write_marc_files(record, marc_record):
    """Write ``record`` as MARCXML and as binary MARC via ``default_storage``.

    ``marc_record.xml_record`` and ``marc_record.mrc_record`` are used as the
    storage paths.  Each handle is closed even when the write fails.
    """
    # serialize before opening so a serialization error never touches storage
    xmlrecord = pymarc.record_to_xml(record)
    xml_file = default_storage.open(marc_record.xml_record, 'w')
    try:
        xml_file.write(xmlrecord)
    finally:
        xml_file.close()

    mrc_file = default_storage.open(marc_record.mrc_record, 'w')
    try:
        writer = pymarc.MARCWriter(mrc_file)
        writer.write(record)
    finally:
        mrc_file.close()


def _set_locally_assigned_indicators(record, tag):
    """Set the indicators of the first ``tag`` field to blank / '4', if any.

    Used for 050 and 082 because LOC is no longer responsible for these.
    """
    try:
        field = record.get_fields(tag)[0]
    except IndexError:
        return  # no such field, so there is no indicator to change
    field.indicators = [' ', '4']
    # take the field out and put it back through add_ordered_field
    record.remove_field(field)
    record.add_ordered_field(field)


def makemarc(marcfile, edition):
    """Create and store MARC records (MARCXML and .mrc) for ``edition``.

    marcfile -- MARCXML source for the print edition, handed to
        ``pymarc.parse_xml_to_array``; only its first record is used.  When
        falsy, a new empty record is filled in from the edition's own data.
    edition -- the edition being described.  This function reads its
        ``ebooks``, ``ebook_files``, ``identifiers``, ``authors``, ``title``,
        ``publisher_name``, ``publication_date``, ``unglued`` and ``work``.

    If the edition has at least one ebook, two records are written: one
    linking through the unglue.it download page (link target 'UNGLUE') and
    one linking directly to each ebook file (link target 'DIRECT').  If it
    has no ebooks but has an ebook file, a single 'B2U' record is written.

    Returns the primary key of the 'UNGLUE' / 'B2U' ``MARCRecord``, or
    ``None`` when the edition has neither ebooks nor ebook files.
    """
    logger = logging.getLogger(__name__)

    try:
        rights = edition.ebooks.all()[0].rights
        ebf = None
    except IndexError:
        rights = None
        try:
            ebf = edition.ebook_files.all()[0]
        except IndexError:
            # no record if no ebooks
            return None

    logger.info("Making MARC records for edition %s", edition)

    # save lccn for later (if there is one) before deleting it
    print_lccn = None
    if marcfile:
        record = pymarc.parse_xml_to_array(marcfile)[0]
        for lccn in record.get_fields('010'):
            for validlccn in lccn.get_subfields('a'):
                print_lccn = validlccn
        # these fields are dropped; most are rebuilt below for the ebook
        for tag in ('001', '003', '005', '006', '007', '010', '040'):
            for field in record.get_fields(tag):
                record.remove_field(field)
    else:
        record = pymarc.Record()

    # create accession number and write 001 field
    # (control field syntax is special)
    link_target = 'B2U' if ebf else 'UNGLUE'
    marc_record = models.MARCRecord.objects.get_or_create(
        edition=edition, link_target=link_target
    )[0]
    record.add_ordered_field(
        pymarc.Field(tag='001', data=marc_record.accession)
    )

    # add field indicating record originator
    record.add_ordered_field(pymarc.Field(tag='003', data='UnglueIt'))

    # update timestamp of record
    now = datetime.now()
    datestamp = now.strftime('%Y%m%d%H%M%S') + '.0'
    record.add_ordered_field(pymarc.Field(tag='005', data=datestamp))

    # change 006, 007, 008 because this is an online resource
    # 006 is an 18-character fixed field: 'm' at position 0, 'o' at
    # position 6 and 'd' at position 9, blanks elsewhere
    record.add_ordered_field(
        pymarc.Field(tag='006', data='m     o  d        ')
    )
    record.add_ordered_field(pymarc.Field(tag='007', data='cr'))

    try:
        field008 = record.get_fields('008')[0]
    except IndexError:
        # no 008 to adapt: assemble one from today's date and the
        # publication year, with fill characters for everything unknown
        # publication_date is treated as a string; a missing value counts as
        # "no usable year" instead of raising on len(None)
        pub_date = edition.publication_date or ''
        new_field_value = now.strftime('%y%m%d') + 's'
        if len(pub_date) > 3:
            new_field_value += pub_date[0:4]
        else:
            new_field_value += '||||'
        new_field_value += '||||xx |||||o|||||||||||eng||'
    else:
        record.remove_field(field008)
        old_field_value = field008.value()
        # position 23 is overwritten with 'o', everything else is kept
        new_field_value = old_field_value[:23] + 'o' + old_field_value[24:]
    record.add_ordered_field(pymarc.Field(tag='008', data=new_field_value))

    # add ISBN for ebook where applicable; relegate print ISBN to $z
    isbn = ''
    try:
        isbn = edition.identifiers.filter(type='isbn')[0].value
    except IndexError:
        pass
    try:
        field020 = record.get_fields('020')[0]
        print_isbn = field020.get_subfields('a')[0]
        field020.delete_subfield('a')
        if isbn:
            field020.add_subfield('a', isbn)
        field020.add_subfield('z', print_isbn)
    except IndexError:
        # no 020 field, or an 020 without $a
        print_isbn = None

    # change 050 and 082 indicators because LOC is no longer responsible
    # for these
    _set_locally_assigned_indicators(record, '050')
    _set_locally_assigned_indicators(record, '082')

    # author name: only built from the edition when the record has no 100
    if not record.get_fields('100'):
        authors = list(edition.authors.all())
        if authors:
            record.add_ordered_field(pymarc.Field(
                tag='100',
                indicators=['1', ' '],
                subfields=['a', authors[0].last_name_first],
            ))
        for auth in authors[1:]:
            record.add_ordered_field(pymarc.Field(
                tag='700',
                indicators=['1', ' '],
                subfields=[
                    'a', auth.last_name_first,
                    'e', 'joint author.',
                ],
            ))

    # add subfield to 245 indicating format
    try:
        field245 = record.get_fields('245')[0]
    except IndexError:
        field245 = pymarc.Field(
            tag='245',
            indicators=['1', '0'],
            subfields=['a', edition.title],
        )
        record.add_ordered_field(field245)
    # NOTE(review): the medium designation is added as a second $a; the 776
    # title below reads the first $a only.  Confirm whether $h was intended.
    field245.add_subfield('a', '[electronic resource]')

    # publisher, date: only built from the edition when there is no 260
    if not record.get_fields('260'):
        subfields260 = []
        # skip whichever piece is missing rather than crashing on
        # None.name or writing the text "None" into $c
        publisher = edition.publisher_name
        if publisher is not None:
            subfields260.extend(['b', publisher.name])
        if edition.publication_date:
            subfields260.extend(['c', unicode(edition.publication_date)])
        if subfields260:
            record.add_ordered_field(pymarc.Field(
                tag='260',
                indicators=[' ', ' '],
                subfields=subfields260,
            ))

    # modify 300 field (physical description)
    try:
        field300 = record.get_fields('300')[0]
        subfield_a = field300.get_subfields('a')[0]
    except IndexError:
        pass  # no 300 $a to rewrite
    else:
        # drop the trailing punctuation that led into the next subfield
        if subfield_a.endswith((' ;', ' :', ' +')):
            subfield_a = subfield_a[:-2]
        new300a = '1 online resource (' + subfield_a + ')'
        if field300.get_subfields('b'):
            new300a += ' :'
        field300.delete_subfield('a')
        field300.add_subfield('a', new300a)
        field300.delete_subfield('c')

    if rights:
        # add 536 field (funding information)
        if edition.unglued:
            funding_info = 'The book is available as a free download thanks to the generous support of interested readers and organizations, who made donations using the crowd-funding website Unglue.it.'
        elif rights in cc.LICENSE_LIST:
            funding_info = 'The book is available as a free download thanks to a Creative Commons license.'
        else:
            funding_info = 'The book is available as a free download because it is in the Public Domain.'
        record.add_ordered_field(pymarc.Field(
            tag='536',
            indicators=[' ', ' '],
            subfields=['a', funding_info],
        ))

        # add 540 field (terms governing use)
        record.add_ordered_field(pymarc.Field(
            tag='540',
            indicators=[' ', ' '],
            subfields=[
                'a', dict(cc.CHOICES)[rights],
                'u', dict(cc.GRANTS)[rights],
            ],
        ))

    # add 588 field (source of description) - credit where credit is due
    if print_lccn:
        record.add_ordered_field(pymarc.Field(
            tag='588',
            indicators=[' ', ' '],
            subfields=[
                'a', 'Description based on print version record from the Library of Congress.',
            ],
        ))

    # add 776 field (related editions) - preserve pISBN, LCCN, OCLCnum
    #   indicators: 0 8
    #   $i 'Print version: '
    #   $t title
    #   $z pISBN, harvested from 020 (was moved from $a to $z); falls back
    #      to the edition's ISBN when the record had none
    #   $w (DLC) LCCN, harvested from the 010 field before deletion
    #   $w (OCoLC) OCLC number, harvested from the identifiers db
    title = record.get_fields('245')[0].get_subfields('a')[0]
    title = title.split('/')[0]
    try:
        oclcnum = edition.identifiers.filter(type='oclc')[0].value
    except IndexError:
        oclcnum = None
    subfields = ['i', 'Print version: ', 't', title]
    if print_isbn:
        subfields.extend(['z', print_isbn])
    elif isbn:
        subfields.extend(['z', isbn])
    if print_lccn:
        subfields.extend(['w', '(DLC) ' + print_lccn])
    if oclcnum:
        subfields.extend(['w', '(OCoLC) ' + oclcnum])
    record.add_ordered_field(pymarc.Field(
        tag='776',
        indicators=['0', '8'],
        subfields=subfields,
    ))

    # strip any 9XX fields (they're for local use)
    for i in range(900, 1000):
        for field in record.get_fields(str(i)):
            record.remove_field(field)

    # add 856 fields with links for each available file
    # doing this out of order as it's the only thing that differs
    # between direct-link and via-unglue.it versions
    if not ebf:
        # deepcopy so the direct-link record can diverge from `record`
        record_direct = deepcopy(record)  # 2 records for unglued stuff

        for format_tuple in settings.FORMATS:
            ebook_format = format_tuple[0]
            for book in edition.ebooks.filter(format=ebook_format):
                record_direct.add_ordered_field(pymarc.Field(
                    tag='856',
                    indicators=['4', '0'],
                    subfields=[
                        '3', ebook_format + ' version',
                        'q', settings.CONTENT_TYPES[ebook_format],
                        'u', book.url,
                    ],
                ))

    unglued_url = settings.BASE_URL_SECURE + reverse('download', args=[edition.work.id])
    record.add_ordered_field(pymarc.Field(
        tag='856',
        indicators=['4', '0'],
        subfields=['u', unglued_url],
    ))

    if not ebf:
        # the direct-link record was copied from `record`, so it needs its
        # own accession number in place of the copied 001
        field001 = record_direct.get_fields('001')[0]
        record_direct.remove_field(field001)
        marc_record_direct = models.MARCRecord.objects.get_or_create(
            edition=edition, link_target='DIRECT'
        )[0]
        record_direct.add_ordered_field(
            pymarc.Field(tag='001', data=marc_record_direct.accession)
        )

        # write the direct-link MARCXML and .mrc records to storage
        _write_marc_files(record_direct, marc_record_direct)

    _write_marc_files(record, marc_record)

    return marc_record.pk