catalog/schema.py
author Edward Betts <edwardbetts@gmail.com>
Fri Nov 02 15:45:41 2007 +0000 (9 months ago)
changeset 228 2c01f1a1b2b8
parent 175e1bcd3c22a83
permissions -rw-r--r--
rewrite MARC parser and add subtitle to schema
        1 # a python representation of the Open Library schema
        2 # (run this to produce an html representation, or invoke
        3 # it with the arguments "template edition" to print out
        4 # template markup for an edition item.)
        5 
        6 # Following is a python datastructure representing the field-schema for
        7 # bibliographic items in ThingDB.  Where the `count` attribute is not
        8 # specified, its value is `'single'`.  The types `string`, `text`, `url` (and
        9 # perhaps `date`) may all be stored as "strings" in ThingDB, but the
       10 # distinction here may help to render those strings appropriately in the UI.
       11 
       12 schema_ordered = {
       13 
       14             'author':
       15             [
       16                     ('identifier', {
       17                         'type': 'string',
       18                         'count': 'multiple',
       19                         # 'marc_fields': ['100:abcd', '110:ab', '710:ab', '111:acdn', '711:acdn'],
       20                         'example': "Twain, Mark, 1835-1910",
       21                         'description': "unique id in some catalog" }),
       22                     ('name', { 'type': 'string', 'example': "Mark Twain", 'description': "human-readable name" }),
       23                     ('birth_date', { 'type': 'date', 'example': "1835" }),
       24                     ('death_date', { 'type': 'date', 'example': "1910" }),
       25                     ('bio', { 'type': 'text' })
       26             ],
       27 
       28             'edition':
       29             [ 
       30                     ('source_record_loc', {
       31                         'title': "Source Record Locator",
       32                         'type': 'string',
       33                         'count': 'multiple',
       34                         'example': "marc_records_scriblio_net/part01.dat:29834:543",
       35                         'description': "a locator for the source record data" }),
       36                     ('source_record_id', {
       37                         'title': "Source Record Id",
       38                         'type': 'string',
       39                         'count': 'multiple',
       40                         'example': "LC:DLC:00000006",
       41                         'description': "a record identifier that is globally unique and that also can be constructed consistently from the contents of a record and an identifier for its source catalog" }),
       42                     ('author_identifier', {
       43                         'title': "Author Identifier",
       44                         'type': 'string',
       45                         'count': 'multiple',
       46                         'marc_fields': ['100:abcd author_id', '110:ab author_id', '111:acdn author_id'],
       47                         'example': "Twain, Mark, 1835-1910",
       48                         'description': "unique author id in some catalog" }),
       49                     # ('authors', { 'type': 'id-ref', 'count': 'multiple', 'example': 'a/Mark_Twain' }),
       50                     ('contributions', {
       51                         'title': "Contributions",
       52                         'type': 'string',
       53                         'count': 'multiple',
       54                         'marc_fields': ['700:abcde', '710:ab', '711:acdn'],
       55                         'example': 'Illustrated by: Steve Bjorkman' }),
       56                     ('title', {
       57                         'title': "Title",
       58                         'type': 'string',
       59                         'marc_fields': '245:ab clean_name',
       60                         'example': 'The adventures of Tom Sawyer' }),
       61                     ('subtitle', {
       62                         'title': "Subtitle",
       63                         'type': 'string',
       64                         'count': 'multiple',
       65                     }),
       66                     ('by_statement', {
       67                         'title': "By Statement",
       68                         'type': 'string',
       69                         'count': 'multiple',
       70                         'marc_fields': '245:c',
       71                         'example': 'Herman Melville ; [illustrated by Barry Moser]' }),
       72                     ('sort_title', {
       73                         'title': "Title for sorting",
       74                         'type': 'string',
       75                         'example': 'adventures of Tom Sawyer'
       76                         }),
       77                     ('other_titles', {
       78                         'title': "Other Titles",
       79                         'type': 'string',
       80                         'count': 'multiple',
       81                         'marc_fields': ['246:a', '730:a-z', '740:apn' ],
       82                         'example': "Mark Twain's The Adventures of Tom Sawyer" }),
       83                     ('work_title', {
       84                         'title': "Work Title",
       85                         'type': 'string',
       86                         'marc_fields': ['240:amnpr', '130:a-z'],
       87                         'description': "The 240 \"work title\" is used in the OCLC FRBR algorithm. The 130 is also used, and there should be either a 130 or a 240 in a record, but not both. It would be ideal if we could pick up either for the work title." }),
       88                     ('edition', {
       89                         'title': "Edition",
       90                         'type': 'string',
       91                         'marc_fields': '250:ab',
       92                         'example': '2nd. editon',
       93                         'description': 'information about this edition' }),
       94                     ('publisher', {
       95                         'title': "Publisher",
       96                         'type': 'string',
       97                         'marc_fields': '260:b clean_name',
       98                         'example': 'W. W. Norton & Co.' }),
       99                     ('publish_place', {
      100                         'title': "Publish Place",
      101                         'type': 'string',
      102                         'count': 'multiple',
      103                         'marc_fields': '260:a clean',
      104                         'example': 'New York' }),
      105                     ('publish_date', { 
      106                         'title': "Publish Date",
      107                         'type': 'date',
      108                         'marc_fields': '008:7-10',
      109                         'example': '2006' }),
      110                     ('pagination', {
      111                         'title': "Pagination",
      112                         'type': 'string',
      113                         'marc_fields': '300:a',
      114                         'example': "viii, 383 p. :",
      115                         'description': "full pagination information" }),
      116                     ('number_of_pages', {
      117                         'title': "Number of Pages",
      118                         'type': 'int',
      119                         'example': '237',
      120                         'marc_fields': '300:a biggest_decimal',
      121                         'description': 'largest decimal found' }),
      122                     ('subjects', {
      123                         'title': "Subjects",
      124                         'type': 'string',
      125                         'count': 'multiple',
      126                         'marc_fields': ['600:abcd--x--v--y--z',
      127                                         '610:ab--x--v--y--z',
      128                                         '630:acdegnpqst--v--x--y--z',
      129                                         '650:a--x--v--y--z',
      130                                         '651:a--x--v--y--z'],
      131                         'example': 'Runaway children -- Fiction' }),
      132                     ('subject_place', {
      133                         'title': "Subject Places",
      134                         'type': 'string',
      135                         'count': 'multiple',
      136                         'marc_fields': ['651:a*', '650:z*'],
      137                         'example': "Venice (Italy)" }),
      138                     ('subject_time', {
      139                         'title': "Subject Times",
      140                         'type': 'string',
      141                         'count': 'multiple',
      142                         'marc_fields': ['600:y*', '650:y*', '651:y*'],
      143                         'example': '20th century' }),
      144                     ('genre', {
      145                         'title': "Genre",
      146                         'type': 'string',
      147                         'count': 'multiple',
      148                         'marc_fields': ['600:v*', '650:v*', '651:v*'],
      149                         'example': "Biography" }),
      150                     ('series', {
      151                         'title': "Series Information",
      152                         'type': 'string',
      153                         'count': 'multiple',
      154                         'marc_fields': ['440:av', '490:av', '830:av' ],
      155                         'example': "Oxford world's classics" }),
      156                     ('language', {
      157                         'title': "Language",
      158                         'type': 'string',
      159                         'marc_fields': '"ISO:" 008:35-37 +',
      160                         'example': 'ISO:tel',
      161                         'description': "coded or human-readable description of the text's language" }),
      162                     ('physical_format', {
      163                         'title': "Physical Format",
      164                         'type': 'string',
      165                         'count': 'multiple',
      166                         'marc_fields': '245:h' }),
      167                     ('notes', {
      168                         'title': "Notes",
      169                         'type': 'string',
      170                         'count': 'multiple',
      171                         'marc_fields': '500-599!505!520:a-z',
      172                         }),
      173                     ('description', {
      174                         'title': "Description",
      175                         'type': 'text',
      176                         'marc_fields': '520:a'
      177                         }),
      178                     ('exerpts', { 'type': 'text', 'count': 'multiple' }),
      179                     ('table_of_contents', {
      180                         'title': "Table of Contents",
      181                         'type': 'text',
      182                         'count': 'multiple',
      183                         'marc_fields': '505:art'
      184                         }),
      185                     ('cover_image', { 'type': 'url' }),
      186                     ('scan_contributor', { 'type': 'string' }),
      187                     ('scan_sponsor', { 'type': 'string' }),
      188                     ('dewey_number', {
      189                         'title': "Dewey Decimal Classification",
      190                         'type': 'string',
      191                         'count': 'multiple',
      192                         'marc_fields': '082:a',
      193                         'example': '914.3' }),
      194                     ('LC_classification', {
      195                         'title': "Library of Congress Classification",
      196                         'type': 'string',
      197                         'count': 'multiple',
      198                         'marc_fields': '050:ab',
      199                         'example': 'BJ1533.C4 L49' }),
      200                     ('ISBN', {
      201                         'type': 'string',
      202                         'count': 'multiple',
      203                         'marc_fields': ['020:a normalize_isbn', '024:a normalize_isbn'],
      204                         'example': '9780393926033',
      205                         'description': 'a 13-digit ISBN' }),
      206                     ('UCC_13', { 'type': 'string' }),
      207                     ('UPC', { 'type': 'string' }),
      208                     ('ISMN', { 'type': 'string' }),
      209                     ('DOI', { 'type': 'string' }),
      210                     ('LCCN', {
      211                         'type': 'string',
      212                         'marc_fields': '010:a normalize_lccn',
      213                         'example': "2006285320" }),
      214                     ('GTIN_14', { 'type': 'string' }),
      215                     ('oca_identifier', { 'type': 'string', 'example': 'albertgallatinja00stevrich' })
      216             ]
      217     }
      218 
      219 schema = {}
      220 for (typename, ordered_fields) in schema_ordered.iteritems ():
      221     fields = {}
      222     for (fname, fspec) in ordered_fields:
      223         fields[fname] = fspec
      224     schema[typename] = fields
      225 
      226 def print_html ():
      227         for (typename, fields) in schema_ordered.iteritems ():
      228                 print "<p><b>" + typename + "</b></p>"
      229                 print "<table border=\"1\"><tbody>"
      230                 print "<tr><th>Field</th><th>Type</th><th>MARC Fields</th><th>Example (Description)</th></tr>"
      231                 for (fname, fspec) in fields:
      232                         marc_fields = fspec.get ('marc_fields', [])
      233                         if (type (marc_fields) != list):
      234                                 marc_fields = [marc_fields]
      235                         print "<tr>"
      236                         print "<td><b>" + fname + "</b></td>"
      237                         print "<td>" + fspec['type'] + ((fspec.get ('count', "single") == "multiple" and "*") or '') + "</td>"
      238                         print "<td>" + ", ".join (marc_fields) + "</td>"
      239                         print "<td>" + ((fspec.get ('example') and '"' + fspec['example'] + '"') or '') + ((fspec.get ('description') and " <i>(" + fspec['description'] + ")</i>") or '') + "</td>"
      240                         print "</tr>"
      241                 print "</tbody></table>"
      242 
      243 def print_template (typename):
      244         print "<dl class=\"metadata\">"
      245         for (fname, fspec) in schema_ordered[typename]:
      246                 title = fspec.get ('title', fname)
      247                 description = fspec.get ('description', "")
      248                 multiple = (fspec.get ('count', "single") == "multiple")
      249 
      250                 print "$if page.d.get('%s'):" % fname
      251                 print "\t<dt title=\"%s\"><b>%s</b></dt>" % (description, title)
      252                 if multiple:
      253                         print "\t$for v in page.d.get('%s'): <dd>$v</dd>" % fname
      254                 else:
      255                         print "\t<dd>$page.%s</dd>" % fname
      256         print "</dl>"
      257                 
      258 if __name__ == "__main__":
      259         from sys import argv
      260         if len (argv) == 3:
      261                 if argv[1] == "template":
      262                         typename = argv[2]
      263                         print_template (typename)
      264         else:
      265                 print_html ()
      266