import collections
import re
from lxml import etree
from pymods.constants import NAMESPACES, DATE_FIELDS
__pdoc__ = {} # for pdoc documentation - http://pdoc.burntsushi.net/pdoc
Abstract = collections.namedtuple('Abstract', 'text type displayLabel elem')
# helper statements for pdoc documentation - http://pdoc.burntsushi.net/pdoc
__pdoc__['Abstract.text'] = 'Abstract elem text value.'
__pdoc__['Abstract.type'] = 'Value of elem@type attribute.'
__pdoc__['Abstract.displayLabel'] = 'Value of elem@displayLabel attribute.'
__pdoc__['Abstract.elem'] = 'lxml.etree.Element.'
Collection = collections.namedtuple('Collection', 'location title url elem')
__pdoc__['Collection'] = 'Tuple container for archival collection information.'
__pdoc__['Collection.location'] = 'Collection location (relatedItem[@type="host"]/location/physicalLocation).'
__pdoc__['Collection.title'] = 'Collection title (relatedItem[@type="host"]/titleInfo).'
__pdoc__[
'Collection.url'] = 'Value of relatedItem[@type="host"]/location/url, which for local practice is a link to the finding aid.'
__pdoc__['Collection.elem'] = 'lxml.etree.Element.'
Date = collections.namedtuple('Date', 'text type elem')
__pdoc__['Date.text'] = 'Date elem text value. Potentially reformatted if a date range.'
__pdoc__['Date.type'] = 'Date elem type.'
__pdoc__['Date.elem'] = 'lxml.etree.Elements.'
Genre = collections.namedtuple('Genre', 'text uri authority authorityURI elem')
__pdoc__['Genre.text'] = 'Genre elem text value.'
__pdoc__['Genre.uri'] = 'Value of elem@valueURI attribute.'
__pdoc__['Genre.authority'] = 'Value of elem@authority attribute.'
__pdoc__['Genre.authorityURI'] = 'Value of elem@authorityURI attribute.'
__pdoc__['Genre.elem'] = 'lxml.etree.Element.'
Identifier = collections.namedtuple('Identifier', 'text type elem')
__pdoc__['Identifier.text'] = 'Identifier elem text value.'
__pdoc__['Identifier.type'] = 'Value of elem@type attribute.'
__pdoc__['Identifier.elem'] = 'lxml.etree.Element.'
Language = collections.namedtuple('Language', 'text code authority elem')
__pdoc__['Language.text'] = 'Language elem[@type="text"] value.'
__pdoc__['Language.code'] = 'Language elem[@type="code"] value.'
__pdoc__['Language.authority'] = 'Value of elem@authority attribute.'
__pdoc__['Language.elem'] = 'lxml.etree.Element.'
Name = collections.namedtuple('Name', 'text type uri authority authorityURI role elem')
__pdoc__['Name.text'] = 'Name elem text value.'
__pdoc__['Name.type'] = 'Value of elem@type attribute.'
__pdoc__['Name.uri'] = 'Value of elem@valueURI attribute.'
__pdoc__['Name.authority'] = 'Value of elem@authority attribute.'
__pdoc__['Name.authorityURI'] = 'Value of elem@authorityURI attribute.'
__pdoc__['Name.role'] = 'Role tuple generated from the values in elem/role/roleTerm'
__pdoc__['Name.elem'] = 'lxml.etree.Element.'
NamePart = collections.namedtuple('NamePart', 'text type elem')
__pdoc__['NamePart'] = 'Used internally to reformat name texts.'
Note = collections.namedtuple('Note', 'text type displayLabel elem')
__pdoc__['Note.text'] = 'Note elem text value.'
__pdoc__['Note.type'] = 'Value of elem@type attribute.'
__pdoc__['Note.displayLabel'] = 'Value of elem@displayLabel attribute.'
__pdoc__['Note.elem'] = 'lxml.etree.Element.'
PublicationPlace = collections.namedtuple('PublicationPlace', 'text type elem')
__pdoc__['PublicationPlace.text'] = 'Publication place elem text value.'
__pdoc__['PublicationPlace.type'] = 'Value of elem@type attribute.'
__pdoc__['PublicationPlace.elem'] = 'lxml.etree.Element.'
Rights = collections.namedtuple('Rights', 'text type uri elem')
__pdoc__['Rights.text'] = 'Rights elem text value.'
__pdoc__['Rights.type'] = 'Value of elem@type attribute.'
__pdoc__[
'Rights.uri'] = "Value of elem@xlink:href attribute. Local practice is to store rightsstaments.org URI's in this attribute."
__pdoc__['Rights.elem'] = 'lxml.etree.Element.'
Role = collections.namedtuple('Role', 'text code authority elem')
__pdoc__['Role.text'] = 'Role elem[@type="text"] value.'
__pdoc__['Role.code'] = 'Role elem[@type="code"] value.'
__pdoc__['Role.authority'] = 'Value of elem@authority attribute.'
__pdoc__['Role.elem'] = 'lxml.etree.Element.'
Subject = collections.namedtuple('Subject', 'text uri authority authorityURI elem')
__pdoc__['Subject.text'] = 'Text values of children of Subject elem. Potentially reformatted into LCSH order.'
__pdoc__['Subject.uri'] = 'Value of elem@valueURI attribute.'
__pdoc__['Subject.authority'] = 'Value of elem@authority attribute.'
__pdoc__['Subject.authorityURI'] = 'Value of elem@authorityURI attribute.'
__pdoc__['Subject.elem'] = 'lxml.etree.Element.'
SubjectPart = collections.namedtuple('SubjectPart', 'text type elem')
__pdoc__['SubjectPart'] = 'Used internally to reformat subject texts.'
# Making life easier
mods = NAMESPACES['mods']
[docs]class Record(etree.ElementBase):
"""
Base record class. Subclass of etree.ElementBase.
"""
def _init(self):
super(Record, self)._init()
[docs]class MODSRecord(Record):
"""
Class for retrieving information from documents using the
MODSXML standard (http://www.loc.gov/standards/mods).
Most element structures are supported. Data is returned mostly
as lists of strings or lists of named tuples. When possible
data is parsed from element parts and returned in typical
LOC ordered strings:
* {family name}, {given name}, {dates} for names.
* {non-sort character} {title}: {subtitle} for titles.
"""
def _init(self):
super(MODSRecord, self)._init()
@property
def abstract(self):
"""
Pull information from mods:abstract element(s).
:return: A list of Abstract elements with text, type, and displayLabel attributes.
"""
return [Abstract(getattr(abstract, 'text', ''),
abstract.attrib.get('type'),
abstract.attrib.get('displayLabel'),
abstract)
for abstract in self.iterfind('./{0}abstract'.format(mods))]
@property
def classification(self):
"""
Pull information from mods:classification element(s).
:return: A list of text from classification element(s).
"""
return [classification.text
for classification in self.iterfind('./{0}classification'.format(mods))]
@property
def collection(self):
"""
Retrieve archival collection metadata from mods:relatedItem[type="host"].
:return: A Collection element with location, title, and url attributes.
"""
try:
related_item = self.findall('./{0}relatedItem[@type="host"]'.format(mods))[0]
coll_location, coll_title, coll_url = None, None, None
try:
coll_location = self._physical_location(related_item)[0]
except IndexError:
pass
try:
coll_title = self._title_part(related_item)[0]
except IndexError:
pass
try:
coll_url = self._url(related_item)[0]
except IndexError:
pass
return Collection(coll_location, coll_title, coll_url, related_item)
except IndexError:
return None
@property
def dates(self):
"""
Constructs dates from dateIssued, dateCreated, copyrightDate, and dateOther elements.
:return: List of Date elements with text and type attributes.
"""
try:
return [Date(self._date_text(date_pair)[0], self._date_text(date_pair)[1], date_pair)
for date_pair in self._date_collector(self.find('./{0}originInfo'.format(mods)))]
except TypeError:
return None
@property
def digital_origin(self):
"""
Get text from mods:edition element.
:return: String containing digital origin information.
"""
try:
return self.find('.//{0}digitalOrigin'.format(mods)).text
except AttributeError:
return None
@property
def doi(self):
"""
:return: Item's DOI or None.
"""
try:
return self._identifier(id_type='DOI')[0].text
except IndexError:
return None
@property
def edition(self):
"""
Accesses mods:edition element.
:return: Edition element text or None.
"""
try:
return self.find('.//{0}edition'.format(mods)).text
except AttributeError:
return None
@property
def extent(self):
"""
Accesses mods:extent element.
:return: A list of mods:extent texts.
"""
return [extent.text for extent in self.iterfind('.//{0}extent'.format(mods))]
@property
def form(self):
"""
Accesses mods:physicalDescription/mods:form element.
:return: A list of mods:form texts.
"""
return [form.text for form in self.iterfind('./{0}physicalDescription/{0}form'.format(mods))]
@property
def genre(self):
"""
Accesses mods:genre element.
:return: A list containing Genre elements with term, uri, authority,
and authorityURI attributes.
"""
return [Genre(genre.text,
genre.attrib.get('valueURI'),
genre.attrib.get('authority'),
genre.attrib.get('authorityURI'),
genre)
for genre in self.iterfind('./{0}genre'.format(mods))]
@property
def geographic_code(self):
"""
Accesses mods:geographicCode element.
:return: A list of mods:geographicCode texts.
"""
return [geocode.text for geocode in self.iterfind('./{0}subject/{0}geographicCode'.format(mods))]
@property
def get_corp_names(self):
"""
Separates corporate names from other name types.
:return: A list of corporate names.
"""
return sorted([name for name in self.get_names(type='corporate')])
@property
def get_creators(self):
"""
Separates creator names from other name roles.
:return: A list of creator names.
"""
return sorted([name for name in self.get_names(role='Creator')]) # TODO: this needs to flexible to code='cre'
[docs] def get_names(self, **kwargs):
"""
A customizable name query service. Subsets of all record names can be identified by
type ('personal', 'corporate', etc.), name authority, or role.
:param kwargs: A key, value pair of type="*", authority="*", or role="*".
:return: A list of names matching query.
"""
if 'type' in kwargs.keys():
return [name for name in self.names if name.type == kwargs['type']]
elif 'authority' in kwargs.keys():
return [name for name in self.names if name.authority == kwargs['authority']]
elif 'role' in kwargs.keys():
return [name for name in self.names if name.role.text == kwargs['role']]
else:
raise KeyError
[docs] def get_notes(self, **kwargs):
"""
A customizable name query service. Subsets of all record notes can be identified by
type or displayLabel.
:param kwargs: A key, value pair of type="*" or displayLabel="*".
:return: A list of notes matching query.
"""
if 'type' in kwargs.keys():
return [note for note in self.note if note.type == kwargs['type']]
elif 'displayLabel' in kwargs.keys():
return [note for note in self.note if note.displayLabel == kwargs['displayLabel']]
else:
raise KeyError
@property
def get_pers_names(self):
"""
Separates personal names from other name types.
:return: A list of personal names.
"""
return sorted([name for name in self.get_names(type='personal')])
@property
def identifiers(self):
"""
Accesses mods:identifier elements.
:return: A list of identifiers.
"""
return self._identifier()
@property
def iid(self):
"""
A custom FSU identifier service.
:return: Item's IID or None.
"""
try:
return self._identifier(id_type='IID')[0].text
except IndexError:
return None
@property
def internet_media_type(self):
"""
Accesses mods:physicalDescription/mods:internetMediaType element.
:return: A list of mods:internetMediaType texts.
"""
return [mime_type.text for mime_type in
self.iterfind('./{0}physicalDescription/{0}internetMediaType'.format(mods))]
@property
def issuance(self):
"""
Accesses mods:issuance element.
:return: List of mods:issuance texts.
"""
return [issuance.text for issuance in self.iterfind('.//{0}issuance'.format(mods))]
@property
def language(self):
"""
Accesses mods:languageTerm elements.
:return: A list of Language elements with text, code, and authority attributes.
"""
return [Language(language.find('./{0}languageTerm[@type="text"]'.format(mods)).text,
language.find('./{0}languageTerm[@type="code"]'.format(mods)).text,
language.find('./{0}languageTerm[@type="text"]'.format(mods)).attrib.get('authority'),
language)
if len(language) > 1
else Language(None,
language.find('./{0}languageTerm'.format(mods)).text,
language.find('./{0}languageTerm'.format(mods)).attrib.get('authority'),
language)
if language.find('./{0}languageTerm'.format(mods)).text.islower()
else Language(language.find('./{0}languageTerm'.format(mods)).text,
None,
language.find('./{0}languageTerm'.format(mods)).attrib.get('authority'),
language)
for language in self.iterfind('{0}language'.format(mods))]
@property
def names(self):
"""
General mods:name service.
:return: A list of Name elements with text, uri, authority, and authorityURI attributes.
"""
return [Name(name._name_text(),
name.attrib.get('type'),
name.attrib.get('valueURI'),
name.attrib.get('authority'),
name.attrib.get('authorityURI'),
name._name_role(),
name)
for name in self.iterfind('./{0}name'.format(mods))]
@property
def name_parts(self):
"""
Not currently implemented.
:return:
"""
return NotImplemented
# TODO: return unformatted name parts for transformation scenarios
@property
def note(self):
"""
Access mods:note elements.
:return: A list containing Note elements with text, type, and displayLabel attributes.
"""
return [Note(note.text, note.attrib.get('type'), note.attrib.get('displayLabel'), note)
for note in self.iterfind('./{0}note'.format(mods))]
@property
def physical_description_note(self):
"""
Access mods:physicalDescription/mods:note elements and return a list of text values.
:return: A list of note text values.
"""
return [note.text for note in self.findall('./{0}physicalDescription/{0}note'.format(mods))]
@property
def physical_location(self):
"""
Access mods:mods/mods:location/mods:physicalLocation and return text values.
:return: A list of element text values.
"""
return self._physical_location()
@property
def pid(self):
"""
Get fedora PID from MODS record.
:return: Item's fedora PID or None.
"""
try:
return self._identifier(id_type='fedora')[0].text
except IndexError:
return None
@property
def publication_place(self):
"""
Accesses mods:originInfo/mods:place elements.
:return: A list of PublicationPlace elements with text and type attributes.
"""
return [PublicationPlace(place.text, place.attrib.get('type'), place)
for place in self.iterfind('./{0}originInfo/{0}place/{0}placeTerm'.format(mods))]
@property
def publisher(self):
"""
Accesses mods:publisher elements.
:return: A list of element text values.
"""
return [publisher.text for publisher in
self.findall('./{0}originInfo/{0}publisher'.format(mods))]
@property
def purl(self):
"""
Retrieves record's Persistent URL from mods:mods/mods:location/mods:url.
:return: List of strings.
"""
purl = re.compile('((http)(s)?(://purl)[\w\d:#@%/;$()~_?\+-=\\\.&]+)')
return [url.text for url in self.iterfind('./{0}location/{0}url'.format(mods)) if purl.search(url.text)]
@property
def rights(self):
"""
Access mods:accessCondition and return values.
:return: A list containing Rights elements with text, type, and uri.
"""
return [Rights(rights.text,
rights.attrib.get('type'),
rights.attrib.get('{http://www.w3.org/1999/xlink}href'),
rights)
for rights in self.iterfind('{0}accessCondition'.format(mods))]
@property
def subjects(self):
"""
General subject retrieval service.
:return: list of Subject elements with text, uri, authority and authorityURI values.
"""
return [Subject(subject._subject_text(),
subject[0].attrib.get('valueURI'),
subject.attrib.get('authority'),
subject.attrib.get('authorityURI'),
subject)
if subject.attrib.get('valueURI') is None
else Subject(subject._subject_text(),
subject.attrib.get('valueURI'),
subject.attrib.get('authority'),
subject.attrib.get('authorityURI'),
subject)
for subject in self.iterfind('{0}subject'.format(mods))
if 'geographicCode' not in subject[0].tag]
@property
def subject_parts(self):
"""
Not currently implemented.
:return:
"""
return NotImplemented
# TODO: return unformatted subject parts for transformation scenarios
@property
def table_of_contents(self):
try:
return [toc.text for toc in self.iterfind('{0}tableOfContents'.format(mods))]
except AttributeError:
return None
@property
def titles(self):
"""
General title retrieval service.
:return: A list of title texts.
"""
return [title for title in self._title_part()]
@property
def title_parts(self):
"""
Not currently implemented.
:return:
"""
return NotImplemented
# TODO: return unformatted title parts for transformation scenarios
@property
def type_of_resource(self):
"""
Access mods:typeOfResource and return text value.
:return: Text value or None.
"""
try:
return self.find('./{0}typeOfResource'.format(mods)).text
except AttributeError:
return None
def _date_collector(self, elem):
for tag in DATE_FIELDS:
try:
if elem.find('./{0}'.format(tag)) is not None:
return [elem.findall('./{0}'.format(tag))]
except AttributeError:
pass
def _date_text(self, date_pair):
if len(date_pair) == 1:
return date_pair[0].text, date_pair[0].tag
elif len(date_pair) == 2:
date_list = sorted([date.text for date in date_pair])
return '{0} - {1}'.format(date_list[0], date_list[1]), date_pair[0].tag
def _get_dates(self, elem):
return [date for date in elem.find('./{0}originInfo'.format(mods)).iterchildren()
if date.tag in DATE_FIELDS]
def _get_text(self, elem):
"""Wrapping common use of getattr for safe attribute access."""
return getattr(elem, 'text', None)
def _identifier(self, id_type=None):
"""
:param id_type: A MODSXML @type='id_type' attribute value.
:return: A list of Identifier elements with text and type attributes.
"""
if id_type:
return [Identifier(identifier.text, id_type, identifier)
for identifier in self.iterfind('.//{0}identifier'.format(mods)) if
identifier.attrib.get('type') == id_type]
else:
return [Identifier(identifier.text, identifier.attrib.get('type'), identifier)
for identifier in self.iterfind('.//{0}identifier'.format(mods))]
def _name_part(self, elem=None):
if elem is None:
elem = self
return [NamePart(name.text, name.attrib.get('type'), name) for name in
elem.iterfind('./{0}namePart'.format(mods))]
def _name_role(self, elem=None):
if elem is None:
elem = self
return Role(elem._name_role_text(), elem._name_role_code(), elem._name_role_authority(), elem)
def _name_role_authority(self):
try:
return self.find('.//{0}roleTerm'.format(mods)).attrib.get('authority')
except AttributeError:
return None
def _name_role_code(self):
try:
return self.find('.//{0}roleTerm[@type="code"]'.format(mods)).text
except AttributeError:
return None
def _name_role_text(self):
try:
return self.find('.//{0}roleTerm[@type="text"]'.format(mods)).text
except AttributeError:
return None
def _name_text(self, elem=None):
if elem is None:
elem = self
if elem.attrib.get('type') == 'personal':
family = ', '.join(x.text for x in elem._name_part() if x.type == 'family')
given = ', '.join(x.text for x in elem._name_part() if x.type == 'given')
terms_of_address = ', '.join(x.text for x in elem._name_part() if x.type == 'termsOfAddress')
date = ', '.join(x.text for x in elem._name_part() if x.type == 'date')
untyped_name = ', '.join(x.text for x in elem._name_part() if x.type is None)
return '{family}{given}{termsOfAddress}{untyped_name}{date}'.format(
family=family + ', ' if family else '',
given=given if given else '',
termsOfAddress=', ' + terms_of_address if terms_of_address else '',
untyped_name=untyped_name if untyped_name else '',
date=', ' + date if date else ''
)
else:
text = ''
for part in elem.iter(tag='{0}namePart'.format(mods)):
text = text + '{0}, '.format(part.text)
return text.strip(', ')
def _physical_location(self, elem=None):
"""
Access mods:mods/mods:location/mods:physicalLocation and return text values.
:return: A list of text values.
"""
if elem is None:
elem = self
return [location.text for location in elem.iterfind('./{0}location/{0}physicalLocation'.format(mods))]
def _subject_part(self, elem=None):
if elem is None:
elem = self
return [SubjectPart(term._name_text(), term.tag, term)
if 'name' in term.tag
else SubjectPart(term.text, term.tag, term)
for term in elem.iterchildren()]
def _subject_text(self):
subject_text = ''
for subject_part in self._subject_part():
subject_text = subject_text + '{0}--'.format(subject_part.text)
return subject_text.strip('--')
def _title_part(self, elem=None): # TODO - name title stuff to match name&subject methods
"""
:param elem: The element containing a mods:titleInfo elements (i.e. mods:mods or mods:relatedItem).
:return: A list of correctly formatted titles.
"""
if elem is None:
elem = self
return [self._title_text(
self._get_text(title.find('./{0}nonSort'.format(mods))),
self._get_text(title.find('./{0}title'.format(mods))),
self._get_text(title.find('./{0}subTitle'.format(mods))))
for title in elem.iterfind('./{0}titleInfo'.format(mods))]
def _title_text(self, non_sort, title, subtitle):
"""Construct valid title regardless if any constituent part missing."""
return '{non_sort}{title}{subtitle}'.format(
non_sort=non_sort + ' ' if non_sort else '',
title=title if title else '',
subtitle=': ' + subtitle if subtitle else '')
def _url(self, elem):
return [url.text for url in elem.iterfind('./{0}location/{0}url'.format(mods))]
[docs]class OAIRecord(Record):
"""
Record class for records stored in the OAI-PMH format.
OAI documents in either the OAI-PMH standard or
repox export standard are supported.
This class allows access to OAI wrapper data,
such as the OAI record URN. The OAIRecord.metadata property
allows access to the metadata content of the record.
Standard methods from the MODSRecord and DCRecord classes
can be performed on OAIRecord objects through the metadata
property. Internal tests will automatically select the correct
parser and class to return.
"""
def _init(self):
super(OAIRecord, self)._init()
@property
def oai_urn(self):
"""
:return: The OAI ID as a string.
"""
if '{http://repox.ist.utl.pt}' in self.tag:
try:
return self.attrib['id']
except AttributeError:
pass
elif '{http://www.openarchives.org/OAI/2.0/}' in self.tag:
try:
return self.find('{0}header/{0}identifier'.format(NAMESPACES['oai_dc'])).text
except AttributeError:
pass
else:
try:
return self.find('{}header/{}identifier').text
except AttributeError:
pass
@property
def metadata(self):
"""
Exposes the metadata content of an OAIRecord.
:return: A reparsed root element either in the MODSRecord or DCRecord class, as appropriate.
"""
record_data = self.find('./{*}metadata')
if record_data is not None:
try:
if 'mods' in record_data[0].tag:
mods_parser_registration = etree.ElementDefaultClassLookup(element=MODSRecord)
mods_parser = etree.XMLParser()
mods_parser.set_element_class_lookup(mods_parser_registration)
return etree.XML(etree.tostring(record_data[0], encoding='UTF-8').decode('utf-8'),
parser=mods_parser)
elif 'qualified' in record_data[0].tag:
qdc_parser_registration = etree.ElementDefaultClassLookup(element=DCRecord)
qdc_parser = etree.XMLParser()
qdc_parser.set_element_class_lookup(qdc_parser_registration)
return etree.XML(etree.tostring(record_data[0], encoding='UTF-8').decode('utf-8'),
parser=qdc_parser)
elif 'dc' in record_data[0].tag:
dc_parser_registration = etree.ElementDefaultClassLookup(element=DCRecord)
dc_parser = etree.XMLParser()
dc_parser.set_element_class_lookup(dc_parser_registration)
return etree.XML(etree.tostring(record_data[0], encoding='UTF-8').decode('utf-8'),
parser=dc_parser)
except IndexError:
pass
[docs]class DCRecord(Record):
"""
Record class for Dublin Core and Qualified Dublin Core elements.
"""
def _init(self):
super(DCRecord, self)._init()
[docs] def get_element(self, elem, delimiter=None):
"""
:param elem: An element. It can be named explicitly by namespace using Clark Notation,
or using the form '{*}elem' will match elem in any namespace.
:param delimiter: A character used to separate values within a single element.
:return: A list of element values.
"""
if self.find('{0}'.format(elem)) is not None:
if delimiter is None:
return [item.text for item in self.findall('{0}'.format(elem)) if item.text]
else:
return [split_text.strip()
for item in self.findall('{0}'.format(elem)) if item.text
for split_text in item.text.split(delimiter)]