Source code for pymods.record

import collections
import re

from lxml import etree

from pymods.constants import NAMESPACES, DATE_FIELDS

# Registry of extra documentation strings consumed by pdoc
# (http://pdoc.burntsushi.net/pdoc). Keys are '<Name>' or '<Name>.<field>'.
__pdoc__ = {}

# Each namedtuple below is a lightweight container returned by the record
# classes; the trailing ``elem`` field carries the originating lxml element(s).

Abstract = collections.namedtuple('Abstract', 'text type displayLabel elem')
__pdoc__['Abstract.text'] = 'Abstract elem text value.'
__pdoc__['Abstract.type'] = 'Value of elem@type attribute.'
__pdoc__['Abstract.displayLabel'] = 'Value of elem@displayLabel attribute.'
__pdoc__['Abstract.elem'] = 'lxml.etree.Element.'

Collection = collections.namedtuple('Collection', 'location title url elem')
__pdoc__['Collection'] = 'Tuple container for archival collection information.'
__pdoc__['Collection.location'] = 'Collection location (relatedItem[@type="host"]/location/physicalLocation).'
__pdoc__['Collection.title'] = 'Collection title (relatedItem[@type="host"]/titleInfo).'
__pdoc__['Collection.url'] = (
    'Value of relatedItem[@type="host"]/location/url, '
    'which for local practice is a link to the finding aid.'
)
__pdoc__['Collection.elem'] = 'lxml.etree.Element.'

# NOTE: Date.elem holds the *list* of date elements that produced the text,
# hence the plural in its description.
Date = collections.namedtuple('Date', 'text type elem')
__pdoc__['Date.text'] = 'Date elem text value. Potentially reformatted if a date range.'
__pdoc__['Date.type'] = 'Date elem type.'
__pdoc__['Date.elem'] = 'lxml.etree.Elements.'

Genre = collections.namedtuple('Genre', 'text uri authority authorityURI elem')
__pdoc__['Genre.text'] = 'Genre elem text value.'
__pdoc__['Genre.uri'] = 'Value of elem@valueURI attribute.'
__pdoc__['Genre.authority'] = 'Value of elem@authority attribute.'
__pdoc__['Genre.authorityURI'] = 'Value of elem@authorityURI attribute.'
__pdoc__['Genre.elem'] = 'lxml.etree.Element.'

Identifier = collections.namedtuple('Identifier', 'text type elem')
__pdoc__['Identifier.text'] = 'Identifier elem text value.'
__pdoc__['Identifier.type'] = 'Value of elem@type attribute.'
__pdoc__['Identifier.elem'] = 'lxml.etree.Element.'

Language = collections.namedtuple('Language', 'text code authority elem')
__pdoc__['Language.text'] = 'Language elem[@type="text"] value.'
__pdoc__['Language.code'] = 'Language elem[@type="code"] value.'
__pdoc__['Language.authority'] = 'Value of elem@authority attribute.'
__pdoc__['Language.elem'] = 'lxml.etree.Element.'

Name = collections.namedtuple('Name', 'text type uri authority authorityURI role elem')
__pdoc__['Name.text'] = 'Name elem text value.'
__pdoc__['Name.type'] = 'Value of elem@type attribute.'
__pdoc__['Name.uri'] = 'Value of elem@valueURI attribute.'
__pdoc__['Name.authority'] = 'Value of elem@authority attribute.'
__pdoc__['Name.authorityURI'] = 'Value of elem@authorityURI attribute.'
__pdoc__['Name.role'] = 'Role tuple generated from the values in elem/role/roleTerm'
__pdoc__['Name.elem'] = 'lxml.etree.Element.'

NamePart = collections.namedtuple('NamePart', 'text type elem')
__pdoc__['NamePart'] = 'Used internally to reformat name texts.'

Note = collections.namedtuple('Note', 'text type displayLabel elem')
__pdoc__['Note.text'] = 'Note elem text value.'
__pdoc__['Note.type'] = 'Value of elem@type attribute.'
__pdoc__['Note.displayLabel'] = 'Value of elem@displayLabel attribute.'
__pdoc__['Note.elem'] = 'lxml.etree.Element.'

PublicationPlace = collections.namedtuple('PublicationPlace', 'text type elem')
__pdoc__['PublicationPlace.text'] = 'Publication place elem text value.'
__pdoc__['PublicationPlace.type'] = 'Value of elem@type attribute.'
__pdoc__['PublicationPlace.elem'] = 'lxml.etree.Element.'

Rights = collections.namedtuple('Rights', 'text type uri elem')
__pdoc__['Rights.text'] = 'Rights elem text value.'
__pdoc__['Rights.type'] = 'Value of elem@type attribute.'
__pdoc__['Rights.uri'] = (
    'Value of elem@xlink:href attribute. '
    'Local practice is to store rightsstatements.org URIs in this attribute.'
)
__pdoc__['Rights.elem'] = 'lxml.etree.Element.'

Role = collections.namedtuple('Role', 'text code authority elem')
__pdoc__['Role.text'] = 'Role elem[@type="text"] value.'
__pdoc__['Role.code'] = 'Role elem[@type="code"] value.'
__pdoc__['Role.authority'] = 'Value of elem@authority attribute.'
__pdoc__['Role.elem'] = 'lxml.etree.Element.'

Subject = collections.namedtuple('Subject', 'text uri authority authorityURI elem')
__pdoc__['Subject.text'] = 'Text values of children of Subject elem. Potentially reformatted into LCSH order.'
__pdoc__['Subject.uri'] = 'Value of elem@valueURI attribute.'
__pdoc__['Subject.authority'] = 'Value of elem@authority attribute.'
__pdoc__['Subject.authorityURI'] = 'Value of elem@authorityURI attribute.'
__pdoc__['Subject.elem'] = 'lxml.etree.Element.'

SubjectPart = collections.namedtuple('SubjectPart', 'text type elem')
__pdoc__['SubjectPart'] = 'Used internally to reformat subject texts.'

# Shorthand for the MODS entry of NAMESPACES. It is interpolated in front of
# element names when building the find()/iterfind() paths used throughout this
# module (presumably a Clark-notation "{uri}" prefix -- confirm in
# pymods.constants).
mods = NAMESPACES['mods']


class Record(etree.ElementBase):
    """
    Base record class. Subclass of etree.ElementBase.

    Serves as the common parent of the concrete record classes in this
    module; it adds no behaviour of its own.
    """

    def _init(self):
        """Initialisation hook; simply defers to the parent implementation."""
        super(Record, self)._init()
class MODSRecord(Record):
    """
    Class for retrieving information from documents using the
    MODSXML standard (http://www.loc.gov/standards/mods).

    Most element structures are supported. Data is returned mostly
    as lists of strings or lists of named tuples. When possible
    data is parsed from element parts and returned in typical
    LOC ordered strings:

    * {family name}, {given name}, {dates} for names.
    * {non-sort character} {title}: {subtitle} for titles.
    """

    def _init(self):
        """Initialisation hook; simply defers to the parent implementation."""
        super(MODSRecord, self)._init()

    @property
    def abstract(self):
        """
        Pull information from mods:abstract element(s).

        :return: A list of Abstract elements with text, type,
            and displayLabel attributes.
        """
        return [Abstract(getattr(abstract, 'text', ''),
                         abstract.attrib.get('type'),
                         abstract.attrib.get('displayLabel'),
                         abstract)
                for abstract in self.iterfind('./{0}abstract'.format(mods))]

    @property
    def classification(self):
        """
        Pull information from mods:classification element(s).

        :return: A list of text from classification element(s).
        """
        return [classification.text
                for classification in self.iterfind('./{0}classification'.format(mods))]

    @property
    def collection(self):
        """
        Retrieve archival collection metadata from mods:relatedItem[type="host"].

        Only the first host relatedItem is used, and only the first
        location, title and url found inside it.

        :return: A Collection element with location, title, and url
            attributes (each None when absent), or None when the record
            has no host relatedItem.
        """
        related_items = self.findall('./{0}relatedItem[@type="host"]'.format(mods))
        if not related_items:
            return None
        related_item = related_items[0]
        locations = self._physical_location(related_item)
        titles = self._title_part(related_item)
        urls = self._url(related_item)
        return Collection(locations[0] if locations else None,
                          titles[0] if titles else None,
                          urls[0] if urls else None,
                          related_item)

    @property
    def dates(self):
        """
        Constructs dates from the date elements listed in DATE_FIELDS that
        are children of the first mods:originInfo element.

        Only the first DATE_FIELDS tag that is present is reported (see
        _date_collector).

        :return: List of Date elements whose text is the date (or
            "start - end" for a pair), whose type is the element tag and
            whose elem is the list of source elements. None when no date
            can be built.
        """
        origin_info = self.find('./{0}originInfo'.format(mods))
        try:
            dates = []
            for date_pair in self._date_collector(origin_info):
                # Unpacking raises TypeError when _date_text returns None,
                # which keeps the "return None on anything unusable" contract.
                text, tag = self._date_text(date_pair)
                dates.append(Date(text, tag, date_pair))
            return dates
        except TypeError:
            return None

    @property
    def digital_origin(self):
        """
        Get text from mods:digitalOrigin element.

        :return: String containing digital origin information, or None.
        """
        return self._get_text(self.find('.//{0}digitalOrigin'.format(mods)))

    @property
    def doi(self):
        """
        :return: Item's DOI or None.
        """
        identifiers = self._identifier(id_type='DOI')
        return identifiers[0].text if identifiers else None

    @property
    def edition(self):
        """
        Accesses mods:edition element.

        :return: Edition element text or None.
        """
        return self._get_text(self.find('.//{0}edition'.format(mods)))

    @property
    def extent(self):
        """
        Accesses mods:extent element.

        :return: A list of mods:extent texts.
        """
        return [extent.text for extent in self.iterfind('.//{0}extent'.format(mods))]

    @property
    def form(self):
        """
        Accesses mods:physicalDescription/mods:form element.

        :return: A list of mods:form texts.
        """
        return [form.text
                for form in self.iterfind('./{0}physicalDescription/{0}form'.format(mods))]

    @property
    def genre(self):
        """
        Accesses mods:genre element.

        :return: A list containing Genre elements with text, uri,
            authority, and authorityURI attributes.
        """
        return [Genre(genre.text,
                      genre.attrib.get('valueURI'),
                      genre.attrib.get('authority'),
                      genre.attrib.get('authorityURI'),
                      genre)
                for genre in self.iterfind('./{0}genre'.format(mods))]

    @property
    def geographic_code(self):
        """
        Accesses mods:geographicCode element.

        :return: A list of mods:geographicCode texts.
        """
        return [geocode.text
                for geocode in self.iterfind('./{0}subject/{0}geographicCode'.format(mods))]

    @property
    def get_corp_names(self):
        """
        Separates corporate names from other name types.

        :return: A list of corporate names, sorted by name text.
        """
        # Sort on the text only: comparing whole Name tuples falls through to
        # None/str or element comparisons on ties, which raises on Python 3.
        return sorted(self.get_names(type='corporate'), key=lambda name: name.text)

    @property
    def get_creators(self):
        """
        Separates creator names from other name roles.

        :return: A list of creator names, sorted by name text.
        """
        # TODO: this needs to flexible to code='cre'
        return sorted(self.get_names(role='Creator'), key=lambda name: name.text)

    def get_names(self, **kwargs):
        """
        A customizable name query service. Subsets of all record names
        can be identified by type ('personal', 'corporate', etc.),
        name authority, or role.

        Only one filter is applied; when several keywords are given,
        type wins over authority, which wins over role.

        :param kwargs: A key, value pair of type="*", authority="*", or role="*".
        :return: A list of names matching query.
        :raises KeyError: If none of the supported keywords is supplied.
        """
        if 'type' in kwargs:
            return [name for name in self.names if name.type == kwargs['type']]
        if 'authority' in kwargs:
            return [name for name in self.names if name.authority == kwargs['authority']]
        if 'role' in kwargs:
            return [name for name in self.names if name.role.text == kwargs['role']]
        raise KeyError('get_names() requires one of: type, authority, role')

    def get_notes(self, **kwargs):
        """
        A customizable note query service. Subsets of all record notes
        can be identified by type or displayLabel.

        When both keywords are given, only type is applied.

        :param kwargs: A key, value pair of type="*" or displayLabel="*".
        :return: A list of notes matching query.
        :raises KeyError: If neither supported keyword is supplied.
        """
        if 'type' in kwargs:
            return [note for note in self.note if note.type == kwargs['type']]
        if 'displayLabel' in kwargs:
            return [note for note in self.note
                    if note.displayLabel == kwargs['displayLabel']]
        raise KeyError('get_notes() requires one of: type, displayLabel')

    @property
    def get_pers_names(self):
        """
        Separates personal names from other name types.

        :return: A list of personal names, sorted by name text.
        """
        return sorted(self.get_names(type='personal'), key=lambda name: name.text)

    @property
    def identifiers(self):
        """
        Accesses mods:identifier elements.

        :return: A list of identifiers.
        """
        return self._identifier()

    @property
    def iid(self):
        """
        A custom FSU identifier service.

        :return: Item's IID or None.
        """
        identifiers = self._identifier(id_type='IID')
        return identifiers[0].text if identifiers else None

    @property
    def internet_media_type(self):
        """
        Accesses mods:physicalDescription/mods:internetMediaType element.

        :return: A list of mods:internetMediaType texts.
        """
        return [mime_type.text
                for mime_type in self.iterfind(
                    './{0}physicalDescription/{0}internetMediaType'.format(mods))]

    @property
    def issuance(self):
        """
        Accesses mods:issuance element.

        :return: List of mods:issuance texts.
        """
        return [issuance.text for issuance in self.iterfind('.//{0}issuance'.format(mods))]

    @property
    def language(self):
        """
        Accesses mods:languageTerm elements.

        A mods:language with more than one child is read through its
        @type="text" and @type="code" terms. With a single term, an
        all-lowercase value is treated as a code and anything else as text.
        Missing or empty terms yield None fields instead of raising.

        :return: A list of Language elements with text, code, and
            authority attributes.
        """
        term_path = './{0}languageTerm'.format(mods)
        languages = []
        for language in self.iterfind('{0}language'.format(mods)):
            if len(language) > 1:
                text_term = language.find(term_path + '[@type="text"]')
                code_term = language.find(term_path + '[@type="code"]')
                # Authority is read from the text term; fall back to the
                # code term only when there is no text term at all.
                authority_term = text_term if text_term is not None else code_term
                authority = (authority_term.attrib.get('authority')
                             if authority_term is not None else None)
                languages.append(Language(self._get_text(text_term),
                                          self._get_text(code_term),
                                          authority,
                                          language))
                continue

            term = language.find(term_path)
            if term is None:
                languages.append(Language(None, None, None, language))
                continue
            value = term.text
            authority = term.attrib.get('authority')
            if value is not None and value.islower():
                languages.append(Language(None, value, authority, language))
            else:
                languages.append(Language(value, None, authority, language))
        return languages

    @property
    def names(self):
        """
        General mods:name service.

        :return: A list of Name elements with text, type, uri, authority,
            authorityURI, and role attributes.
        """
        return [Name(name._name_text(),
                     name.attrib.get('type'),
                     name.attrib.get('valueURI'),
                     name.attrib.get('authority'),
                     name.attrib.get('authorityURI'),
                     name._name_role(),
                     name)
                for name in self.iterfind('./{0}name'.format(mods))]

    @property
    def name_parts(self):
        """
        Not currently implemented.

        :return: NotImplemented
        """
        # TODO: return unformatted name parts for transformation scenarios
        return NotImplemented

    @property
    def note(self):
        """
        Access mods:note elements.

        :return: A list containing Note elements with text, type,
            and displayLabel attributes.
        """
        return [Note(note.text,
                     note.attrib.get('type'),
                     note.attrib.get('displayLabel'),
                     note)
                for note in self.iterfind('./{0}note'.format(mods))]

    @property
    def physical_description_note(self):
        """
        Access mods:physicalDescription/mods:note elements and return
        a list of text values.

        :return: A list of note text values.
        """
        return [note.text
                for note in self.findall('./{0}physicalDescription/{0}note'.format(mods))]

    @property
    def physical_location(self):
        """
        Access mods:mods/mods:location/mods:physicalLocation and return
        text values.

        :return: A list of element text values.
        """
        return self._physical_location()

    @property
    def pid(self):
        """
        Get fedora PID from MODS record.

        :return: Item's fedora PID or None.
        """
        identifiers = self._identifier(id_type='fedora')
        return identifiers[0].text if identifiers else None

    @property
    def publication_place(self):
        """
        Accesses mods:originInfo/mods:place elements.

        :return: A list of PublicationPlace elements with text and
            type attributes.
        """
        return [PublicationPlace(place.text, place.attrib.get('type'), place)
                for place in self.iterfind(
                    './{0}originInfo/{0}place/{0}placeTerm'.format(mods))]

    @property
    def publisher(self):
        """
        Accesses mods:publisher elements.

        :return: A list of element text values.
        """
        return [publisher.text
                for publisher in self.findall('./{0}originInfo/{0}publisher'.format(mods))]

    @property
    def purl(self):
        """
        Retrieves record's Persistent URL from mods:mods/mods:location/mods:url.

        :return: List of url texts that contain an http(s)://purl... address.
            Empty url elements are skipped.
        """
        # Raw string: same pattern as before, without invalid-escape warnings.
        purl = re.compile(r'((http)(s)?(://purl)[\w\d:#@%/;$()~_?\+-=\\.&]+)')
        return [url.text
                for url in self.iterfind('./{0}location/{0}url'.format(mods))
                if url.text and purl.search(url.text)]

    @property
    def rights(self):
        """
        Access mods:accessCondition and return values.

        :return: A list containing Rights elements with text, type, and uri.
        """
        return [Rights(rights.text,
                       rights.attrib.get('type'),
                       rights.attrib.get('{http://www.w3.org/1999/xlink}href'),
                       rights)
                for rights in self.iterfind('{0}accessCondition'.format(mods))]

    @property
    def subjects(self):
        """
        General subject retrieval service.

        Subjects whose first child is a geographicCode are omitted (see
        geographic_code), as are subject elements with no children. The uri
        is taken from the subject's own @valueURI, falling back to the
        first child's @valueURI.

        :return: list of Subject elements with text, uri, authority
            and authorityURI values.
        """
        subjects = []
        for subject in self.iterfind('{0}subject'.format(mods)):
            if len(subject) == 0 or 'geographicCode' in subject[0].tag:
                continue
            uri = subject.attrib.get('valueURI')
            if uri is None:
                uri = subject[0].attrib.get('valueURI')
            subjects.append(Subject(subject._subject_text(),
                                    uri,
                                    subject.attrib.get('authority'),
                                    subject.attrib.get('authorityURI'),
                                    subject))
        return subjects

    @property
    def subject_parts(self):
        """
        Not currently implemented.

        :return: NotImplemented
        """
        # TODO: return unformatted subject parts for transformation scenarios
        return NotImplemented

    @property
    def table_of_contents(self):
        """
        Access mods:tableOfContents elements.

        :return: A list of element text values.
        """
        return [toc.text for toc in self.iterfind('{0}tableOfContents'.format(mods))]

    @property
    def titles(self):
        """
        General title retrieval service.

        :return: A list of title texts.
        """
        return list(self._title_part())

    @property
    def title_parts(self):
        """
        Not currently implemented.

        :return: NotImplemented
        """
        # TODO: return unformatted title parts for transformation scenarios
        return NotImplemented

    @property
    def type_of_resource(self):
        """
        Access mods:typeOfResource and return text value.

        :return: Text value or None.
        """
        return self._get_text(self.find('./{0}typeOfResource'.format(mods)))

    def _date_collector(self, elem):
        """
        Find the first tag from DATE_FIELDS that occurs as a child of elem.

        :param elem: Element to search (normally mods:originInfo); may be None.
        :return: A one-item list holding the list of matching elements,
            or None when elem is None or holds none of the date tags.
        """
        for tag in DATE_FIELDS:
            try:
                matches = elem.findall('./{0}'.format(tag))
            except AttributeError:
                # elem is None (no originInfo): nothing to collect.
                continue
            if matches:
                return [matches]
        return None

    def _date_text(self, date_pair):
        """
        Build the display text for a group of same-tag date elements.

        :param date_pair: A list of one or two date elements.
        :return: A (text, tag) tuple; two dates are sorted and joined as
            "first - second". None for any other group size.
        """
        if len(date_pair) == 1:
            return date_pair[0].text, date_pair[0].tag
        if len(date_pair) == 2:
            date_list = sorted(date.text for date in date_pair)
            return '{0} - {1}'.format(date_list[0], date_list[1]), date_pair[0].tag
        return None

    def _get_dates(self, elem):
        """Return the children of elem's first mods:originInfo whose tag is in DATE_FIELDS."""
        origin_info = elem.find('./{0}originInfo'.format(mods))
        return [date for date in origin_info.iterchildren() if date.tag in DATE_FIELDS]

    def _get_text(self, elem):
        """Wrapping common use of getattr for safe attribute access."""
        return getattr(elem, 'text', None)

    def _identifier(self, id_type=None):
        """
        :param id_type: A MODSXML @type='id_type' attribute value.
        :return: A list of Identifier elements with text and type attributes.
        """
        identifiers = self.iterfind('.//{0}identifier'.format(mods))
        if id_type:
            return [Identifier(identifier.text, id_type, identifier)
                    for identifier in identifiers
                    if identifier.attrib.get('type') == id_type]
        return [Identifier(identifier.text, identifier.attrib.get('type'), identifier)
                for identifier in identifiers]

    def _name_part(self, elem=None):
        """Return a NamePart for every direct mods:namePart child of elem (default: self)."""
        if elem is None:
            elem = self
        return [NamePart(name.text, name.attrib.get('type'), name)
                for name in elem.iterfind('./{0}namePart'.format(mods))]

    def _name_role(self, elem=None):
        """Return a Role tuple (text, code, authority, elem) for elem (default: self)."""
        if elem is None:
            elem = self
        return Role(elem._name_role_text(),
                    elem._name_role_code(),
                    elem._name_role_authority(),
                    elem)

    def _name_role_authority(self):
        """Return @authority of the first descendant mods:roleTerm, or None."""
        role_term = self.find('.//{0}roleTerm'.format(mods))
        if role_term is None:
            return None
        return role_term.attrib.get('authority')

    def _name_role_code(self):
        """Return the text of the first descendant roleTerm[@type="code"], or None."""
        return self._get_text(self.find('.//{0}roleTerm[@type="code"]'.format(mods)))

    def _name_role_text(self):
        """Return the text of the first descendant roleTerm[@type="text"], or None."""
        return self._get_text(self.find('.//{0}roleTerm[@type="text"]'.format(mods)))

    def _name_text(self, elem=None):
        """
        Build the display form of a mods:name element.

        Personal names are assembled from typed nameParts as
        "family, given, termsOfAddress untyped, date"; every other name
        type is the comma-joined text of all namePart descendants.
        nameParts without text are ignored.

        :param elem: The mods:name element (default: self).
        :return: The formatted name string.
        """
        if elem is None:
            elem = self

        if elem.attrib.get('type') == 'personal':
            # Collect the parts once, grouped by their @type (None = untyped).
            grouped = collections.defaultdict(list)
            for part in elem._name_part():
                if part.text is not None:
                    grouped[part.type].append(part.text)
            family = ', '.join(grouped['family'])
            given = ', '.join(grouped['given'])
            terms_of_address = ', '.join(grouped['termsOfAddress'])
            date = ', '.join(grouped['date'])
            untyped_name = ', '.join(grouped[None])
            return '{family}{given}{termsOfAddress}{untyped_name}{date}'.format(
                family=family + ', ' if family else '',
                given=given,
                termsOfAddress=', ' + terms_of_address if terms_of_address else '',
                untyped_name=untyped_name,
                date=', ' + date if date else ''
            )

        texts = [part.text
                 for part in elem.iter(tag='{0}namePart'.format(mods))
                 if part.text is not None]
        # Trailing/leading commas and spaces are trimmed, as before.
        return ', '.join(texts).strip(', ')

    def _physical_location(self, elem=None):
        """
        Access mods:mods/mods:location/mods:physicalLocation and return
        text values.

        :param elem: Element to search (default: self).
        :return: A list of text values.
        """
        if elem is None:
            elem = self
        return [location.text
                for location in elem.iterfind('./{0}location/{0}physicalLocation'.format(mods))]

    def _subject_part(self, elem=None):
        """
        Return a SubjectPart for every child of elem (default: self).

        Children whose tag contains 'name' are rendered through
        _name_text; all others contribute their own text.
        """
        if elem is None:
            elem = self
        return [SubjectPart(term._name_text(), term.tag, term)
                if 'name' in term.tag
                else SubjectPart(term.text, term.tag, term)
                for term in elem.iterchildren()]

    def _subject_text(self):
        """
        Join the subject's parts with '--'.

        Parts without text are skipped. Hyphens belonging to a part itself
        (e.g. an open date such as "1945-") are preserved.

        :return: The formatted subject string.
        """
        return '--'.join(subject_part.text
                         for subject_part in self._subject_part()
                         if subject_part.text)

    def _title_part(self, elem=None):
        # TODO - name title stuff to match name&subject methods
        """
        :param elem: The element containing a mods:titleInfo elements
            (i.e. mods:mods or mods:relatedItem).
        :return: A list of correctly formatted titles.
        """
        if elem is None:
            elem = self
        return [self._title_text(
                    self._get_text(title.find('./{0}nonSort'.format(mods))),
                    self._get_text(title.find('./{0}title'.format(mods))),
                    self._get_text(title.find('./{0}subTitle'.format(mods))))
                for title in elem.iterfind('./{0}titleInfo'.format(mods))]

    def _title_text(self, non_sort, title, subtitle):
        """Construct valid title regardless if any constituent part missing."""
        return '{non_sort}{title}{subtitle}'.format(
            non_sort=non_sort + ' ' if non_sort else '',
            title=title if title else '',
            subtitle=': ' + subtitle if subtitle else '')

    def _url(self, elem):
        """Return the text of every mods:location/mods:url under elem."""
        return [url.text for url in elem.iterfind('./{0}location/{0}url'.format(mods))]
class OAIRecord(Record):
    """
    Record class for records stored in the OAI-PMH format. OAI documents
    in either the OAI-PMH standard or repox export standard are supported.

    This class allows access to OAI wrapper data, such as the OAI record URN.
    The OAIRecord.metadata property allows access to the metadata content
    of the record. Standard methods from the MODSRecord and DCRecord classes
    can be performed on OAIRecord objects through the metadata property.
    Internal tests will automatically select the correct parser and class
    to return.
    """

    def _init(self):
        """Initialisation hook; simply defers to the parent implementation."""
        super(OAIRecord, self)._init()

    @property
    def oai_urn(self):
        """
        :return: The OAI ID as a string, or None when it cannot be found.
        """
        if '{http://repox.ist.utl.pt}' in self.tag:
            # A missing attribute raises KeyError on item access, so use
            # .get() to return None like the other branches do.
            return self.attrib.get('id')
        if '{http://www.openarchives.org/OAI/2.0/}' in self.tag:
            identifier = self.find(
                '{0}header/{0}identifier'.format(NAMESPACES['oai_dc']))
        else:
            # Un-namespaced record: '{}' selects elements in no namespace.
            identifier = self.find('{}header/{}identifier')
        return getattr(identifier, 'text', None)

    @property
    def metadata(self):
        """
        Exposes the metadata content of an OAIRecord.

        The first child of the record's metadata element is re-serialised
        and re-parsed so that its elements are instances of the matching
        record class. The class is chosen from the child's tag: 'mods'
        selects MODSRecord; 'qualified' or 'dc' selects DCRecord.

        :return: A reparsed root element either in the MODSRecord or
            DCRecord class, as appropriate; None when there is no metadata
            payload or its tag is not recognised.
        """
        record_data = self.find('./{*}metadata')
        if record_data is None or len(record_data) == 0:
            return None

        payload = record_data[0]
        if 'mods' in payload.tag:
            record_class = MODSRecord
        elif 'qualified' in payload.tag or 'dc' in payload.tag:
            record_class = DCRecord
        else:
            return None
        return self._reparse(payload, record_class)

    @staticmethod
    def _reparse(element, record_class):
        """Serialise element and parse it again with record_class as the element class."""
        lookup = etree.ElementDefaultClassLookup(element=record_class)
        parser = etree.XMLParser()
        parser.set_element_class_lookup(lookup)
        return etree.XML(etree.tostring(element, encoding='UTF-8').decode('utf-8'),
                         parser=parser)
class DCRecord(Record):
    """
    Record class for Dublin Core and Qualified Dublin Core elements.
    """

    def _init(self):
        """Initialisation hook; simply defers to the parent implementation."""
        super(DCRecord, self)._init()

    def get_element(self, elem, delimiter=None):
        """
        Collect the text values of every element matching elem.

        :param elem: An element. It can be named explicitly by namespace using
            Clark Notation, or using the form '{*}elem' will match elem in any
            namespace.
        :param delimiter: A character used to separate values within a
            single element.
        :return: A list of element values (elements without text are
            skipped), or None when no matching element exists.
        """
        path = '{0}'.format(elem)
        if self.find(path) is None:
            return None

        values = []
        for item in self.findall(path):
            if not item.text:
                continue
            if delimiter is None:
                values.append(item.text)
            else:
                # One element may pack several values; split and trim each.
                for split_text in item.text.split(delimiter):
                    values.append(split_text.strip())
        return values