774 lines
21 KiB
Plaintext
774 lines
21 KiB
Plaintext
#!/usr/local/bin/perl -w
|
|
#
|
|
# pgrdf2marc.pl converts one or more items from the Project Gutenberg RDF
|
|
# catalog into MARC format record(s).
|
|
#
|
|
# Detailed POD-style documentation is at the end of this file.
|
|
|
|
use strict;
|
|
use Getopt::Long;
|
|
use Fcntl;
|
|
use GDBM_File;
|
|
|
|
#-----------------------------------------------------------------------
|
|
# Configurables:
|
|
|
|
# Organisation code
|
|
my $org = 'PGUSA';
|
|
|
|
my $publisher = 'Project Gutenberg';
|
|
|
|
my $marcdbpath = '/home/gbnewby/pgiso/marc/marc.db';
|
|
my $newpath = '/home/gbnewby/pgiso/marc';
|
|
my $chgpath = '/home/gbnewby/pgiso/marc';
|
|
|
|
#-----------------------------------------------------------------------
|
|
# NON-Configurables:
|
|
|
|
# ISO 639 Language Codes
|
|
|
|
# populate a hash mapping 639-1 (2-letter) codes to 639-2 (3-letter) codes
|
|
|
|
my %map639 = qw(
|
|
ab abk aa aar af afr sq alb am amh ar ara hy arm as asm
|
|
ay aym az aze ba bak eu baq bn ben bh bih bi bis be bre
|
|
bg bul my bur be bel ca cat zh chi co cos hr scr cs cze
|
|
da dan nl dut dz dzo en eng eo epo et est fo fao fj fij
|
|
fi fin fr fre fy fry gl glg ka geo de ger el gre kl kal
|
|
gn grn gu guj ha hau he heb hi hin hu hun is ice id ind
|
|
ia ina iu iku ik ipk ga gle it ita ja jpn jv jav kn kan
|
|
ks kas kk kaz km khm rw kin ky kir ko kor ku kur oc oci
|
|
lo lao la lat lv lav ln lin lt lit mk mac mg mlg ms may
|
|
ml mlt mi mao mr mar mo mol mn mon na nau ne nep no nor
|
|
or ori om orm pa pan fa per pl pol pt por ps pus qu que
|
|
rm roh ro rum rn run ru rus sm smo sg sag sa san sr scc
|
|
sh scr sn sna sd snd si sin ss ssw sk slo sl slv so som
|
|
st sot es spa su sun sw swa sv swe tl tgl tg tgk ta tam
|
|
tt tat te tel th tha bo tib ti tir to tog ts tso tn tsn
|
|
tr tur tk tuk tw twi ug uig uk ukr ur urd uz uzb vi vie
|
|
vo vol cy wel wo wol xh xho yi yid yo yor za zha zu zul
|
|
);
|
|
|
|
# input record separator is blank line
|
|
$/ = "\n\n";
|
|
|
|
# %cat is a temporary hash for the data extracted from RDF for each item.
|
|
my %cat;
|
|
|
|
my %options = ( debug => 0 );
|
|
GetOptions(\%options, 'debug!', 'help');
|
|
|
|
if ($options{'help'}) {
|
|
print qq!
|
|
pgrdf2marc.pl converts the Project Gutenberg RDF/XML format catalog into
|
|
MARC21 format records. RDF is read from STDIN, and MARC output to STDOUT.
|
|
|
|
Usage: pgrdf2marc.pl [ --help | --debug ]
|
|
|
|
--debug will dump the RDF and the MARC in text form
|
|
|
|
--help prints this message
|
|
|
|
!;
|
|
exit;
|
|
}
|
|
|
|
my %MARCDB;
|
|
# dbmopen %MARCDB, $marcdbpath, 0644 or die "Cannot open MARCDB file: $!\n";
|
|
tie %MARCDB, 'GDBM_File', $marcdbpath, O_CREAT|O_RDWR, 0644;
|
|
|
|
my $today = today();
|
|
|
|
open CHANGE, ">$chgpath/$today.mrc" or die "Cannot open CHANGE file: $!\n";
|
|
open NEW, ">$newpath/$today.mrc" or die "Cannot open NEW file: $!\n";
|
|
|
|
#-----------------------------------------------------------------------
|
|
|
|
my $rdfrecs = 0;
|
|
my $marcrecs = 0;
|
|
my $changes = 0;
|
|
my $newrecs = 0;
|
|
|
|
while (<>) {
|
|
|
|
next unless /rdf:ID="etext(\d+)"/;
|
|
|
|
$rdfrecs++;
|
|
|
|
if ($options{'debug'}) {
|
|
# dump the RDF
|
|
# print "$_\n";
|
|
}
|
|
|
|
tr/\r/ /; # remove CR (if any)
|
|
|
|
# convert commonly used character entities
|
|
s/ & / and /sg;
|
|
s/&/&/sg;
|
|
s/—/--/sg;
|
|
|
|
if (&parse_rdf($_)) {
|
|
my @trec = &build_trec();
|
|
|
|
if ($options{'debug'}) {
|
|
# dump the MARC (pretty-printed)
|
|
foreach (@trec) { print "$_\n"; }
|
|
print "\n";
|
|
|
|
} else {
|
|
|
|
my $marc = &array2marc(@trec);
|
|
my $id = $cat{id};
|
|
|
|
# compare with existing record for change file
|
|
my $old = $MARCDB{$id}; $old = '' unless $old;
|
|
my $new = $marc;
|
|
# strip everything before 008 data ... (yes, horrible kludge!)
|
|
$old =~ s/.*?PGUSA.\d{14}\.0//;
|
|
$new =~ s/.*?PGUSA.\d{14}\.0//;
|
|
|
|
# if the record has changed, save it and write a copy to the CHANGE file
|
|
unless ( $new eq $old ) {
|
|
$MARCDB{$id} = $marc;
|
|
if ($old) {
|
|
print CHANGE $marc;
|
|
$changes++;
|
|
} else {
|
|
print NEW $marc;
|
|
$newrecs++;
|
|
}
|
|
}
|
|
}
|
|
|
|
$marcrecs++;
|
|
}
|
|
}
|
|
|
|
# now write out the MARC records to files, in bundles of 1000
|
|
|
|
my $outlimit = 0;
|
|
|
|
foreach my $id ( sort {$a <=> $b} keys %MARCDB ) {
|
|
if ( $id > $outlimit ) {
|
|
close OUT if $outlimit;
|
|
$outlimit += 1000;
|
|
my $outfile = sprintf "marc/%06d.mrc", $outlimit;
|
|
open OUT, ">$outfile" or die "Cannot open $outfile, $!\n";
|
|
}
|
|
print OUT $MARCDB{$id};
|
|
}
|
|
|
|
close OUT;
|
|
|
|
close CHANGE;
|
|
|
|
#dbmclose %MARCDB;
|
|
untie %MARCDB;
|
|
|
|
print STDERR qq|
|
|
$rdfrecs RDF records processed
|
|
$marcrecs MARC records written
|
|
$changes changed MARC records
|
|
$newrecs new MARC records
|
|
|;
|
|
|
|
exit(0);
|
|
|
|
#-----------------------------------------------------------------------
|
|
|
|
sub parse_rdf {
|
|
# parse an rdf entry and store the data in our catalogue hash
|
|
|
|
# clear %cat;
|
|
%cat = ();
|
|
|
|
$cat{title} = [];
|
|
|
|
$cat{created} = '||||-||-||';
|
|
|
|
# record must have an id ...
|
|
if (/rdf:ID="etext(\d+)"/) { $cat{id} = $1; } else { return 0; }
|
|
|
|
# ... and a title ...
|
|
unless (/<dc:title.*?>(.*?)<\/dc:title>/s) {
|
|
print STDERR "$cat{id} has no title!\n";
|
|
return 0;
|
|
}
|
|
while (s/<dc:title.*?>(.*?)<\/dc:title>//s) {
|
|
$cat{title} = [ @{ $cat{title} }, &split_field($1) ];
|
|
}
|
|
|
|
if (/<dc:created.*?<rdf:value>(.*)<\/rdf:value>/) {
|
|
$cat{created} = $1;
|
|
}
|
|
|
|
#if (/<dc:language.*?<rdf:value>(.*)<\/rdf:value>/) {
|
|
if (m!<dc:language><dcterms:ISO639-2><rdf:value>(.+)</rdf:value></dcterms:ISO639-2></dc:language>!) {
|
|
$cat{language} = $1;
|
|
}
|
|
|
|
if (/<dc:type.*?<rdf:value>(.*)<\/rdf:value>/) {
|
|
$cat{type} = $1;
|
|
}
|
|
|
|
if (/<dc:rights>(.*)<\/dc:rights>/) {
|
|
$cat{rights} = $1;
|
|
}
|
|
|
|
if (/<dc:creator.*?>(.*)<\/dc:creator>/s) {
|
|
$cat{author} = [ &split_field($1) ];
|
|
}
|
|
|
|
if (/<dc:contributor.*?>(.*)<\/dc:contributor>/s) {
|
|
$cat{contributor} = [ &split_field($1) ];
|
|
}
|
|
|
|
if (/<dcterms:LCC>(.*)<\/dcterms:LCC>/s) {
|
|
$cat{call} = [ &split_field($1) ];
|
|
}
|
|
|
|
if (/<dcterms:LCSH>(.*)<\/dcterms:LCSH>/s) {
|
|
$cat{subject} = [ &split_field($1) ];
|
|
}
|
|
|
|
if (/<dc:alternative.*?>(.*)<\/dc:alternative>/is) {
|
|
my $string = $1;
|
|
$string =~ s/\s+/ /gs;
|
|
$cat{alternative} = [ &split_field($string) ];
|
|
}
|
|
|
|
if (/<dc:description.*?>(.*)<\/dc:description>/is) {
|
|
my $string = $1;
|
|
$string =~ s/\s+/ /gs;
|
|
$cat{description} = [ &split_field($string) ];
|
|
}
|
|
|
|
if (/<dc:tableOfContents.*?>(.*)<\/dc:tableOfContents>/is) {
|
|
my $string = $1;
|
|
$string =~ s/\n/ -- /gs;
|
|
$string =~ s/\s+/ /gs;
|
|
$cat{contents} = [ &split_field($string) ];
|
|
}
|
|
|
|
# experimental kludge to determine genre
|
|
# genre is the 'Literary form' specified in the 008 character pos. 33
|
|
GENRE: {
|
|
if (/other (stories|tales)/i) { $cat{genre} = 'j'; last; }
|
|
if (/(poems|poetry)/i) { $cat{genre} = 'p'; last; }
|
|
if (/essays/i) { $cat{genre} = 'e'; last; }
|
|
if (/letters/i) { $cat{genre} = 'i'; last; }
|
|
if (/plays/i) { $cat{genre} = 'd'; last; }
|
|
if (/fiction/i) { $cat{genre} = '1'; last; }
|
|
$cat{genre} = '|';
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
sub split_field {
|
|
# split a (possibly) multivalued field, returning values as an array.
|
|
my $field = shift;
|
|
# join multiple lines
|
|
$field =~ s/>\s+<//gs;
|
|
$field =~ s/"/"/g;
|
|
my @array;
|
|
if ($field =~ /rdf:/i) {
|
|
while ($field =~ s/>([^<]+)<//s) {
|
|
push @array, $1;
|
|
}
|
|
} else {
|
|
push @array, $field;
|
|
}
|
|
return @array;
|
|
}
|
|
|
|
#-----------------------------------------------------------------------
|
|
|
|
sub build_trec {
|
|
# use %cat to build a text array of the MARC record
|
|
my @trec = ();
|
|
|
|
my ($ind2, $resp, $pubyear, $temp);
|
|
my ($name, $q, $d, $role);
|
|
|
|
my $id = $cat{id};
|
|
my $created = $cat{created};
|
|
my $language = $cat{language};
|
|
my $genre = $cat{genre};
|
|
my $type = $cat{type};
|
|
my $rights = $cat{rights};
|
|
|
|
my @titles = @{ $cat{title} };
|
|
my @alternative = @{ $cat{alternative} } if $cat{alternative};
|
|
my @authors = @{ $cat{author} } if $cat{author};
|
|
my @contribs = @{ $cat{contributor} } if $cat{contributor};
|
|
my @subjects = @{ $cat{subject} } if $cat{subject};
|
|
my @call = @{ $cat{call} } if $cat{call};
|
|
my @description = @{ $cat{description} } if $cat{description};
|
|
my @contents = @{ $cat{contents} } if $cat{contents};
|
|
|
|
|
|
# created is formatted as "yyyy-mm-dd" ...
|
|
$created = '||||-||-||' unless defined $created;
|
|
|
|
# extract a publication date (year only)
|
|
$pubyear = substr $created, 0, 4;
|
|
|
|
# reformat creation date as "yymmdd" for 008
|
|
$created =~ s/-//g;
|
|
$created = substr $created, 2, 6;
|
|
# (N.B. the 008 date is the create date for the MARC record.
|
|
# We're pretending that this is the same as the etext. Otherwise,
|
|
# this could just be today's date.)
|
|
|
|
$language = 'en' if (! defined $language);
|
|
my $lang639 = $map639{$language};
|
|
$lang639 = '|||' unless defined $lang639;
|
|
|
|
unless ($type) { $type = "Electronic text"; }
|
|
if ($type =~ /(.*), (.*)/) { $type = "$2 $1"; }
|
|
|
|
# start building the marc text array
|
|
push @trec, "LDR 00568cam a22001693a 4500";
|
|
|
|
push @trec, "001 $id";
|
|
push @trec, "003 $org";
|
|
|
|
# time stamp -- this is the date the record was created, not the PG item
|
|
push @trec, sprintf "005 %s", timestamp();
|
|
|
|
push @trec, sprintf "008 %6ss%4s||||xxu|||||s|||||000 %1s %3s d",
|
|
$created, $pubyear, $genre, $lang639;
|
|
|
|
push @trec, sprintf "040 |a%s|b%s", $org, $lang639;
|
|
push @trec, "042 |adc";
|
|
|
|
foreach (@call) {
|
|
push @trec, sprintf "050 4|a%s", $_;
|
|
}
|
|
|
|
if (@authors) {
|
|
my ($name, $d, $q, $c, undef) = &munge_author( shift @authors );
|
|
unless ($name =~ /(Various|Anonymous)/i) {
|
|
$temp = sprintf "100 1 |a%s", $name;
|
|
$temp .= "|c$c" if $c;
|
|
$temp .= "|q$q" if $q;
|
|
$temp .= ",|d$d" if $d;
|
|
push @trec, $temp;
|
|
}
|
|
}
|
|
|
|
# PG records can have 1 or 2 titles
|
|
# If there are 2, the first one is a Uniform title
|
|
my $ut = $#titles;
|
|
print STDERR "$id has $ut too many titles\n" if $ut > 1;
|
|
|
|
foreach my $title (@titles) {
|
|
|
|
# if the title contains line breaks, then the first marks the start of a subtitle, and
|
|
# any others are a mistake.
|
|
$title =~ s/\n/ : /s;
|
|
$title =~ s/\n/ /gs;
|
|
|
|
$ind2 = nonfilingIndicator( $title, $language );
|
|
|
|
if ( $ut ) { # Uniform title
|
|
|
|
$title =~
|
|
s{ (Catalan|Czech|Dutch|English|Esperanto|Finnish|French|German|Greek|Polish|Portuguese|Romany|Spanish)$}
|
|
{|l$1};
|
|
push @trec, sprintf "240 1%1d|a%s", $ind2, $title;
|
|
$ut = 0;
|
|
|
|
} else { # main title
|
|
|
|
$title =~ s/Other Stories/other stories/i;
|
|
for ($title) {
|
|
my ($a, $b) = split /:/, $title, 2;
|
|
if ( defined $b ) {
|
|
$b =~ s/^\s+//;
|
|
if ( $b =~ s/^(and|or),{0,1} /$1, / ) {
|
|
$title = "$a ;|b$b";
|
|
} elsif ($b) {
|
|
$title = "$a :|b$b";
|
|
}
|
|
}
|
|
}
|
|
$title =~ s/\s+/ /g;
|
|
$resp = &resp_stmt();
|
|
if ($resp) {
|
|
push @trec, sprintf
|
|
"245 1%1d|a%s|h[electronic resource] /|c%s", $ind2, $title, $resp;
|
|
} else {
|
|
push @trec, sprintf
|
|
"245 1%1d|a%s|h[electronic resource]", $ind2, $title;
|
|
}
|
|
}
|
|
}
|
|
|
|
push @trec, "260 |b$publisher,|c$pubyear";
|
|
#push @trec, "500 |aProject Gutenberg";
|
|
|
|
# Description
|
|
foreach (@description) {
|
|
push @trec, sprintf "500 |a%s", $_;
|
|
}
|
|
|
|
# Contents note
|
|
if (@contents) {
|
|
if ($#contents > 0) { # more than one element in contents!
|
|
push @trec, sprintf "505 0 |a%s", (join '--', @contents);
|
|
} else {
|
|
push @trec, sprintf "505 0 |a%s", $contents[0];
|
|
}
|
|
}
|
|
|
|
# Rights management / copyright statement
|
|
if ($rights) {
|
|
push @trec, "506 |a$rights";
|
|
} else {
|
|
push @trec, "506 |aFreely available.";
|
|
}
|
|
push @trec, "516 |a$type";
|
|
|
|
# Subject headings
|
|
# Note: we're using the 653 "uncontrolled terms" field, not LCSH
|
|
foreach (@subjects) {
|
|
push @trec, sprintf "653 |a%s", $_;
|
|
}
|
|
|
|
foreach (@authors) {
|
|
my ($name, $d, $q, $c, $role) = &munge_author( $_ );
|
|
$temp = "700 1 |a$name";
|
|
$temp .= "|c$c" if $c;
|
|
$temp .= "|q$q" if $q;
|
|
$temp .= ",|d$d" if $d;
|
|
$temp .= ",|e$role" if $role;
|
|
push @trec, $temp;
|
|
}
|
|
|
|
foreach (@contribs) {
|
|
my ($name, $d, $q, $c, $role) = &munge_author( $_ );
|
|
$temp = "700 1 |a$name";
|
|
$temp .= "|c$c" if $c;
|
|
$temp .= "|q$q" if $q;
|
|
$temp .= ",|d$d" if $d;
|
|
$temp .= ",|e$role" if $role;
|
|
push @trec, $temp;
|
|
}
|
|
|
|
foreach (@alternative) {
|
|
push @trec, sprintf "740 0 |a%s", $_;
|
|
}
|
|
|
|
push @trec, sprintf "830 0|aProject Gutenberg|v%d", $id;
|
|
push @trec, sprintf "856 40|uhttp://www.gutenberg.org/ebooks/%d", $id;
|
|
|
|
push @trec, "856 42|uhttp://www.gutenberg.org/license|3Rights"
|
|
unless $rights;
|
|
|
|
return @trec;
|
|
}
|
|
|
|
sub nonfilingIndicator {
|
|
# set non-filing indicator for title.
|
|
my ($title, $language) = @_;
|
|
|
|
my $ind2 = 0;
|
|
ARTICLE_CASE: {
|
|
if ($language eq 'en') { # English
|
|
$ind2 = 4 if $title =~ /^The /;
|
|
$ind2 = 3 if $title =~ /^An /;
|
|
$ind2 = 2 if $title =~ /^A /;
|
|
last;
|
|
}
|
|
if ($language eq 'fr') { # French
|
|
$ind2 = 4 if $title =~ /^(Les|Une) /;
|
|
$ind2 = 3 if $title =~ /^(Un|Le|La) /;
|
|
$ind2 = 2 if $title =~ /^L'/;
|
|
last;
|
|
}
|
|
if ($language eq 'de') { # German
|
|
$ind2 = 5 if $title =~ /^Eine /;
|
|
$ind2 = 4 if $title =~ /^(Ein|Der|Das|Die) /;
|
|
last;
|
|
}
|
|
if ($language eq 'du') { # Dutch
|
|
$ind2 = 4 if $title =~ /^(Een|Ene|Het) /;
|
|
$ind2 = 3 if $title =~ /^De /;
|
|
last;
|
|
}
|
|
if ($language eq 'es') { # Spanish
|
|
$ind2 = 4 if $title =~ /^(Las|Los) /;
|
|
$ind2 = 3 if $title =~ /^El /;
|
|
last;
|
|
}
|
|
if ($language eq 'it') { # Italian
|
|
$ind2 = 3 if $title =~ /^Il /;
|
|
last;
|
|
}
|
|
if ($language eq 'pt') { # Portuguese
|
|
$ind2 = 4 if $title =~ /^(Uma) /;
|
|
$ind2 = 3 if $title =~ /^(As|Os|Um) /;
|
|
$ind2 = 2 if $title =~ /^(A|O) /;
|
|
last;
|
|
}
|
|
}
|
|
return $ind2;
|
|
}
|
|
|
|
sub munge_author {
|
|
my $name = shift;
|
|
my ($d, $q, $c, $role);
|
|
|
|
# extract and discard role -- between []
|
|
if ($name =~ s/ \[([^\]]+)\]//) { $role = "\L$1\E"; }
|
|
|
|
# extract the dates (if any) and discard from name
|
|
# dates are assumed as anything starting with a digit
|
|
if ($name =~ s/, ([1-9-].+)//) { $d = $1; }
|
|
|
|
# extract and discard any expanded forenames -- these will be in ()
|
|
if ($name =~ s/ (\(.+\))//) { $q = $1; }
|
|
|
|
# extract and discard title
|
|
if ($name =~ s/ (Sir|Lord|Mrs|Rev|Saint|Dr|Jr)\b\.{0,1}//) { $c = $1; }
|
|
|
|
return ($name, $d, $q, $c, $role);
|
|
}
|
|
|
|
sub resp_stmt {
|
|
# generate a statement of responsibility from the author fields
|
|
my ($author, $name, $role, $resp);
|
|
my @authors = @{ $cat{author} } if $cat{author};
|
|
my @contribs = @{ $cat{contributor} } if $cat{contributor};
|
|
$resp = '';
|
|
|
|
# first author ...
|
|
$author = shift @authors;
|
|
if ($author) {
|
|
($name, undef) = munge_author( $author );
|
|
if ($name =~ /(.*?), (.*)/) { $name = "$2 $1"; }
|
|
$resp = "$name";
|
|
}
|
|
# followed by any additional authors ...
|
|
while ($author = shift @authors) {
|
|
($name, undef) = munge_author( $author );
|
|
if ($name =~ /(.*?), (.*)/) { $name = "$2 $1"; }
|
|
if (@authors) {
|
|
$resp .= ", $name";
|
|
} else {
|
|
$resp .= " and $name";
|
|
}
|
|
}
|
|
# followed by contributors ...
|
|
foreach $author (@contribs) {
|
|
($name, undef, undef, undef, $role) = munge_author( $author );
|
|
next unless $role;
|
|
if ($name =~ /(.*?), (.*)/) { $name = "$2 $1"; }
|
|
while (defined $role) {
|
|
if ($role =~ /edit/i) { $resp .= "; edited by $name"; last; }
|
|
if ($role =~ /Trans/i) { $resp .= "; translated by $name"; last; }
|
|
if ($role =~ /Illus/i) { $resp .= "; illustrated by $name"; last; }
|
|
#$resp .= "; $name";
|
|
#if ($role) { $resp .= " ($role)"; }
|
|
last; # only one pass thru required!
|
|
}
|
|
}
|
|
$resp =~ s/\s+/ /g;
|
|
return $resp;
|
|
}
|
|
|
|
#-----------------------------------------------------------------------
|
|
|
|
sub today {
|
|
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
|
|
return sprintf "%4d-%02d-%02d.%02d%02d", $year+1900, $mon+1, $mday, $hour, $min;
|
|
}
|
|
|
|
sub timestamp {
|
|
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
|
|
return sprintf "%4d%02d%02d%02d%02d%02d.0", $year+1900, $mon+1, $mday, $hour, $min, $sec;
|
|
}
|
|
|
|
#-----------------------------------------------------------------------
|
|
|
|
sub array2marc {
|
|
my @trec = @_;
|
|
|
|
# initialise stuff
|
|
my $offset = 0;
|
|
my $dir = '';
|
|
my $data = '';
|
|
|
|
# default pattern for leader
|
|
my $ldrpat = "%05dnas 22%05duu 4500";
|
|
|
|
my ($line, $field, $tag, $fldlen, $base);
|
|
|
|
# if a leader is included, build the pattern from that ...
|
|
if ( $trec[0] =~ /^LDR/ ) { # leader codes
|
|
$line = shift(@trec);
|
|
# use the leader to create a pattern for building the leader later on
|
|
# only the RS, MC, BLC, EL, DCC and Linked are used
|
|
$ldrpat = '%05d'.substr($line,9,5).'22%05d'.substr($line,21,3).'4500';
|
|
}
|
|
|
|
# process all the tags in sequence
|
|
foreach $line ( @trec ) {
|
|
|
|
# build the directory and data portions
|
|
$tag = substr($line, 0, 3);
|
|
$field = substr($line, 4); # get the data for the tag
|
|
unless ($tag lt '010') {
|
|
$field =~ tr/\|/\037/s; # change subfield delimiter(s)
|
|
}
|
|
$field =~ s/$/\036/; # append a field terminator
|
|
$fldlen = length($field);
|
|
$dir .= sprintf("%3s%04d%05d",$tag,$fldlen,$offset);
|
|
$offset += $fldlen;
|
|
$data .= $field;
|
|
}
|
|
|
|
# append a field terminator to the directory
|
|
$dir =~ s/$/\036/;
|
|
|
|
# append the record terminator
|
|
$data =~ s/$/\035/;
|
|
|
|
# compute lengths
|
|
$base = length($dir) + 24; # base address of data
|
|
my $lrl = $base + length($data); # logical record length
|
|
|
|
# return the complete MARC record
|
|
return (sprintf $ldrpat,$lrl,$base) # leader
|
|
. $dir # directory
|
|
. $data; # data
|
|
|
|
}
|
|
|
|
__END__
|
|
|
|
=head1 NAME
|
|
|
|
pgrdf2marc.pl
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
pgrdf2marc.pl converts one or more items from the Project Gutenberg RDF
|
|
catalog into MARC format record(s).
|
|
|
|
The RDF is read from STDIN, and the MARC output to STDOUT.
|
|
|
|
Dublin Core tags used in the RDF are:
|
|
|
|
dc:title
|
|
dc:alternative
|
|
dc:creator
|
|
dc:contributor
|
|
dc:tableOfContents
|
|
dc:publisher
|
|
dc:rights
|
|
dc:language
|
|
dc:created
|
|
dc:type
|
|
dcterms:LCSH
|
|
dcterms:LCC
|
|
|
|
|
|
A MARC record is simply an ASCII string of arbitrary length.
|
|
|
|
=head2 MARC record structure
|
|
|
|
Leader: start: 0 length: 24
|
|
Base Address (start of data): start: 12 length: 5
|
|
Directory: start: 24, length: (base - 24)
|
|
Tag number: 3 bytes
|
|
data length: 4 bytes
|
|
data offset: 5 bytes
|
|
|
|
Subfields begin with 0x1f
|
|
Fields end with 0x1e
|
|
Records end with 0x1d
|
|
|
|
=head2 Array element structure
|
|
|
|
The conversion process makes use of a simple array structure,
|
|
where each array element contains the tag and data for a single MARC
|
|
field, separated by a single space.
|
|
|
|
cols. 0-2 : tag number
|
|
col. 3 : blank
|
|
cols. 4-5 : indicators
|
|
cols. 6- : tag data
|
|
|
|
e.g.
|
|
|
|
245 10|aSome title|h[GMD]
|
|
|
|
The '|' character is used to represent MARC subfield separators (0x1f).
|
|
|
|
=head1 REFERENCES
|
|
|
|
MARC Standards, http://www.loc.gov/marc/
|
|
|
|
Dublin Core/MARC/GILS Crosswalk, http://www.loc.gov/marc/dccross.html
|
|
|
|
=head1 VERSION
|
|
|
|
Version 2011-01-15
|
|
|
|
=head1 AUTHOR
|
|
|
|
Steve Thomas <stephen.thomas@adelaide.edu.au>
|
|
|
|
=head1 LICENCE
|
|
|
|
Copyright (c) 2004-2011 Steve Thomas <stephen.thomas@adelaide.edu.au>
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a
|
|
copy of this software and associated documentation files (the
|
|
"Software"), to deal in the Software without restriction, including
|
|
without limitation the rights to use, copy, modify, merge, publish,
|
|
distribute, sublicense, and/or sell copies of the Software, and to
|
|
permit persons to whom the Software is furnished to do so, subject to
|
|
the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included
|
|
in all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
|
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
|
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
|
|
=cut
|
|
|
|
__END__
|
|
|
|
|
|
<pgterms:etext rdf:ID="etext12765">
|
|
<dc:publisher>&pg;</dc:publisher>
|
|
<dc:title rdf:parseType="Literal">Geordie's Tryst
|
|
A Tale of Scottish Life</dc:title>
|
|
<dc:creator rdf:parseType="Literal">Rae, Mrs. Milne</dc:creator>
|
|
<pgterms:friendlytitle rdf:parseType="Literal">Geordie's Tryst by Mrs. Milne Rae</pgterms:friendlytitle>
|
|
<dc:language><dcterms:ISO639-2><rdf:value>en</rdf:value></dcterms:ISO639-2></dc:language>
|
|
<dc:created><dcterms:W3CDTF><rdf:value>2004-06-28</rdf:value></dcterms:W3CDTF></dc:created>
|
|
<dc:rights rdf:resource="&lic;" />
|
|
</pgterms:etext>
|
|
|
|
#!/usr/bin/perl -w
|
|
|
|
use Fcntl;
|
|
use GDBM_File;
|
|
use NDBM_File;
|
|
tie %newhash, 'GDBM_File', 'newdb', O_CREAT|O_RDWR, 0644;
|
|
tie %oldhash, 'NDBM_File', "marcdb", 1, 0;
|
|
%newhash = %oldhash;
|
|
untie %newhash;
|
|
untie %oldhash;
|