# pgmarc/marc3f.py
#
# Listing metadata carried over from the repository viewer:
# 383 lines, 11 KiB, Python
#

import re
import pymarc
from pymarc import Subfield, Record, Field, MARCWriter
from datetime import datetime
from libgutenberg import GutenbergDatabase
from libgutenberg.DublinCoreMapping import DublinCoreObject
from os.path import join
# A standalone four-digit number, used to pick a year out of attribute text.
_YEAR_RE = re.compile(r'\b\d{4}\b')

# 008 positions 15-39: place code 'xx ', five fill characters, form of item
# 'o' (online), fourteen fill characters, a blank, and cataloging source 'd'
# (other). Language is left uncoded here because the database holds
# ISO 639-1 codes rather than MARC language codes; field 041 carries it.
_FIELD_008_TAIL = 'xx ' + '|' * 5 + 'o' + '|' * 14 + ' d'


def _build_008(now, attributes):
    """Return the 40-character 008 value for a record entered on *now*.

    The first attribute numbered 906 or 260 whose text contains a
    four-digit number supplies Date 1 (date type 's'); Date 2 is left as
    fill characters. When no such attribute exists the date type and both
    dates are fill characters.
    """
    entered = now.strftime('%y%m%d')
    for att in attributes:
        if att.fk_attriblist in (906, 260):
            found = _YEAR_RE.search(str(att.text))
            if found:
                # 6 + 1 + 4 + 4 + 25 = 40 characters.
                return entered + 's' + found.group() + '|' * 4 + _FIELD_008_TAIL
    # 6 + 9 + 25 = 40 characters.
    return entered + '|' * 9 + _FIELD_008_TAIL


def _single_a_field(tag, indicators, value):
    """Return a data field with the given indicators and one ``$a`` subfield."""
    return pymarc.Field(
        tag=tag,
        indicators=list(indicators),
        subfields=[Subfield(code='a', value=value)],
    )


def _add_attribute_fields(record, attributes, attrib_id, tag, indicators):
    """Add one *tag* field (``$a`` = attribute text) per attribute *attrib_id*."""
    for att in attributes:
        if att.fk_attriblist == attrib_id:
            record.add_ordered_field(
                _single_a_field(tag, indicators, str(att.text))
            )


def _nonfiling_indicator(att):
    """Return the attribute's nonfiling count as an indicator, '0' if unset."""
    nonfiling = att.nonfiling
    return '0' if nonfiling is None else str(nonfiling)


def _title_field(dc, att):
    """Return the 245 field for title attribute *att*.

    A title containing a newline is split: ``dc.title_no_subtitle`` goes to
    ``$a`` and everything after the first line, joined with spaces, to ``$b``.
    """
    subfields = [Subfield(code='a', value=dc.title_no_subtitle)]
    if '\n' in dc.title:
        remainder = dc.title.partition('\n')[2].replace('\n', ' ')
        subfields.append(Subfield(code='h', value='[electronic resource] :'))
        subfields.append(Subfield(code='b', value=remainder))
    else:
        subfields.append(Subfield(code='h', value='[electronic resource]'))
    return pymarc.Field(
        tag='245',
        indicators=['1', _nonfiling_indicator(att)],
        subfields=subfields,
    )


def _format_pub_years(years):
    """Return ``$c`` text such as ``'c1990, 1991.'`` or '' when *years* is empty.

    Assumes *years* is a sequence of ``(event, year)`` pairs, which is the
    shape the previous string-replacement code relied on -- TODO confirm
    against libgutenberg's pubinfo object.
    """
    parts = [
        f"c{year}" if event == 'copyright' else str(year)
        for event, year in years
    ]
    return ', '.join(parts) + '.' if parts else ''


def _imprint_field(pubinfo):
    """Return the 260 field (place, publisher, dates) built from *pubinfo*."""
    subfields = [
        Subfield(code='a', value=f"{pubinfo.place} :"),
        Subfield(code='b', value=f"{pubinfo.publisher},"),
    ]
    dates = _format_pub_years(pubinfo.years)
    if dates:
        subfields.append(Subfield(code='c', value=dates))
    return pymarc.Field(tag='260', indicators=[' ', ' '], subfields=subfields)


def stub(dc):
    """Build a MARC record for one ebook.

    Args:
        dc: Object describing the book. This function reads
            ``project_gutenberg_id``, ``book.attributes`` (items with
            ``fk_attriblist``, ``text`` and ``nonfiling``), ``languages``
            (items with ``id``), ``authors``, ``format_author_date``,
            ``title``, ``title_no_subtitle`` and ``pubinfo``;
            ``add_license`` additionally reads ``rights``.

    Returns:
        The populated ``pymarc.Record``. Fields are inserted with
        ``add_ordered_field``.
    """
    record = pymarc.Record()
    attributes = dc.book.attributes

    # c - Corrected or revised, a - Language material, m - Monograph/Item,
    # 3 - Abbreviated level, u - Unknown
    record.leader[5] = 'c'
    record.leader[6] = 'a'
    record.leader[7] = 'm'
    record.leader[17] = '3'
    record.leader[18] = 'u'

    record.add_ordered_field(
        pymarc.Field(tag='001', data=str(dc.project_gutenberg_id))
    )
    record.add_ordered_field(pymarc.Field(tag='003', data='UtSlPG'))
    # m - Computer file/Electronic resource
    record.add_ordered_field(pymarc.Field(tag='006', data='m'))
    # c - Electronic resource, r - Remote, n - Not applicable
    record.add_ordered_field(pymarc.Field(tag='007', data='cr n'))
    record.add_ordered_field(
        pymarc.Field(tag='008', data=_build_008(datetime.now(), attributes))
    )

    _add_attribute_fields(record, attributes, 10, '010', [' ', ' '])
    record.add_ordered_field(_single_a_field('040', [' ', ' '], 'UtSlPG'))

    if dc.languages:
        language_subfields = [
            Subfield(code='a', value=str(lang.id)) for lang in dc.languages
        ]
        language_subfields.append(Subfield(code='2', value='iso639-1'))
        record.add_ordered_field(
            pymarc.Field(
                tag='041',
                indicators=[' ', '7'],
                subfields=language_subfields,
            )
        )

    for att in attributes:
        if att.fk_attriblist == 240:
            record.add_ordered_field(
                _single_a_field(
                    '240', ['1', _nonfiling_indicator(att)], str(att.text)
                )
            )

    _add_attribute_fields(record, attributes, 246, '246', ['1', ' '])
    _add_attribute_fields(record, attributes, 250, '250', [' ', ' '])
    _add_attribute_fields(record, attributes, 300, '300', [' ', ' '])
    # Attribute 440 feeds both the series statement and the series entry.
    _add_attribute_fields(record, attributes, 440, '490', ['1', ' '])
    _add_attribute_fields(record, attributes, 440, '830', [' ', '0'])
    # TODO: attribute 500 text can span several lines; line breaks are
    # still passed through unchanged.
    _add_attribute_fields(record, attributes, 500, '500', [' ', ' '])
    _add_attribute_fields(record, attributes, 505, '505', ['0', ' '])
    _add_attribute_fields(record, attributes, 508, '508', [' ', ' '])

    # NOTE(review): MARC 21 puts the URI of an 856 in $u ($a is the host
    # name), and the ebook link below is only emitted when a 904 attribute
    # exists. Both are kept as they were -- confirm the intended output
    # before changing them.
    ebook_url = f"https://www.gutenberg.org/ebooks/{str(dc.project_gutenberg_id)}"
    for att in attributes:
        if att.fk_attriblist == 904:
            record.add_ordered_field(
                _single_a_field('856', ['4', '0'], ebook_url)
            )
    _add_attribute_fields(record, attributes, 904, '856', ['4', ' '])

    # First author is the main entry; the rest become added entries.
    authors = dc.authors
    if authors:
        record.add_ordered_field(
            _single_a_field(
                '100', ['1', ' '], dc.format_author_date(authors[0])
            )
        )
        for auth in authors[1:]:
            record.add_ordered_field(
                pymarc.Field(
                    tag='700',
                    indicators=['1', ' '],
                    subfields=[
                        Subfield(code='a', value=dc.format_author_date(auth)),
                        Subfield(code='e', value='joint author.'),
                    ],
                )
            )

    # Title (with a $h format marker) and publisher/date.
    for att in attributes:
        if att.fk_attriblist == 245:
            record.add_ordered_field(_title_field(dc, att))
        elif att.fk_attriblist == 260:
            record.add_ordered_field(_imprint_field(dc.pubinfo))

    add_license(record, dc)
    return record
def add_license(record, dc):
    """Add a 540 field (terms governing use) holding ``dc.rights``.

    Does nothing when ``dc.rights`` is empty or otherwise falsy.
    """
    rights = dc.rights
    if not rights:
        return
    terms_of_use = pymarc.Field(
        tag='540',
        indicators=[' ', ' '],
        subfields=[Subfield(code='a', value=rights)],
    )
    record.add_ordered_field(terms_of_use)
def add_subject(record, dc):
    """Add a 653 field (uncontrolled index terms) built from ``dc.subjects``.

    One ``$a`` subfield is written per subject. Does nothing when
    ``dc.subjects`` is empty or otherwise falsy.
    """
    subjects = dc.subjects
    if not subjects:
        return
    if isinstance(subjects, str):
        # A bare string is a single term, not a sequence of characters.
        subjects = [subjects]
    record.add_ordered_field(
        pymarc.Field(
            tag='653',
            indicators=[' ', ' '],
            subfields=[
                # Subfield's fields are ``code`` and ``value``.
                # Presumably subject objects expose their heading as
                # ``.subject`` -- TODO confirm against libgutenberg; anything
                # without that attribute is stringified as-is.
                Subfield(code='a', value=str(getattr(subject, 'subject', subject)))
                for subject in subjects
            ],
        )
    )
def _load_records(booknums):
    """Load each book number from the database and convert it with ``stub``."""
    records = []
    for booknum in booknums:
        dc = DublinCoreObject()
        dc.load_from_database(booknum)
        records.append(stub(dc))
    return records


def _write_text_dump(path, records):
    """Write the text form of every record to *path*, one after another."""
    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding.
    with open(path, "w", encoding="utf-8") as text_file:
        # Separate records with a newline.
        text_file.writelines(str(record) + "\n" for record in records)
    print(f"Combined records written to {path}")


def _write_marc_file(path, records):
    """Write every record to *path* through ``MARCWriter``."""
    with open(path, "wb") as marc_file:
        writer = MARCWriter(marc_file)
        for record in records:
            writer.write(record)
        writer.close()
    print(f"Combined records written to {path}")


# Generate 100 records for books 1-100 (replace with your actual book
# numbers) and write them to one text file.
all_records = _load_records(range(1, 101))
_write_text_dump("out/combined_output.txt100f", all_records)

# Generate 100 records for books 68995-69094 (replace with your actual book
# numbers). The same records feed both the text dump and the MARC file, so
# each book is loaded only once.
all_records = _load_records(range(68995, 68995 + 100))
_write_text_dump("out/combined_output.txt69000f", all_records)
_write_marc_file("out/combined_output.mrc", all_records)