Source code for bin.metadata

#!/usr/bin/env python

import subprocess
from manipulate import Manipulate
from teimanipulate import TeiManipulate

# Module authorship metadata.
__author__ = "Martin Paul Eve, Dulip Withanage"
__email__ = "martin@martineve.com"
class Metadata(Manipulate):
    """Attach article metadata to the NLM output and strip duplicated metadata.

    The class does three things:

    * builds and runs a Saxon (``net.sf.saxon.Transform``) invocation that
      receives the metadata file as the ``metadataFile`` stylesheet parameter;
    * reads identifiers, titles, contributor names and affiliations out of the
      metadata file;
    * removes leading ``head``/``p``/``cit`` elements from the TEI body whose
      text repeats that metadata.

    Attributes read from ``gv``: ``debug``, ``input_metadata_file_path``,
    ``java_class_path``, ``runtime_catalog_path``, ``nlm_file_path``,
    ``nlm_temp_file_path`` and ``metadata_style_sheet_path``.
    """

    # Namespace map shared by every XPath query in this class.
    _TEI_NAMESPACES = {'tei': 'http://www.tei-c.org/ns/1.0'}

    # Tags of the body elements that pre_clean() inspects.
    _PRE_CLEAN_TAGS = (
        '{http://www.tei-c.org/ns/1.0}head',
        '{http://www.tei-c.org/ns/1.0}p',
        '{http://www.tei-c.org/ns/1.0}cit',
    )

    def __init__(self, gv):
        """Store the settings object and point the DOM loader at the metadata file.

        :param gv: settings object providing the paths and the debug logger
            listed in the class docstring.
        """
        self.gv = gv
        # Flat lookup of everything extracted, keyed 'ID0', 'title0', ...,
        # plus one '<kind>_count' entry per kind.
        self.metadata_items = {}
        # Every extracted text value (IDs, titles, journal titles,
        # affiliations) in extraction order.
        self.metadata = []
        # One list of name-component strings per contributor.
        self.authors = []
        self.debug = self.gv.debug
        self.dom_to_load = self.gv.input_metadata_file_path
        self.dom_temp_file = self.gv.input_metadata_file_path
        self.mod_name = 'Metadata'
        # Called last, exactly as before, so the attributes above are already
        # set when the base initialiser runs.
        Manipulate.__init__(self, gv)

    def _transform_arguments(self):
        """Return the Saxon invocation as an argument list, without shell quoting."""
        return [
            "java", "-classpath", self.gv.java_class_path,
            "-Dxml.catalog.files=" + self.gv.runtime_catalog_path,
            "net.sf.saxon.Transform",
            "-x", "org.apache.xml.resolver.tools.ResolvingXMLReader",
            "-y", "org.apache.xml.resolver.tools.ResolvingXMLReader",
            "-r", "org.apache.xml.resolver.tools.CatalogResolver",
            "-o", self.gv.nlm_file_path,
            self.gv.nlm_temp_file_path,
            self.gv.metadata_style_sheet_path,
            "metadataFile=" + self.gv.input_metadata_file_path,
        ]

    def attach_metadata(self):
        """Return the metadata transform as a single command-line string.

        The string is the space-joined argument list with the metadata file
        path wrapped in double quotes. It is kept for callers that want the
        textual command; run() itself executes the argument list directly.

        :returns: the command line as a string.
        """
        arguments = self._transform_arguments()
        arguments[-1] = 'metadataFile="' + self.gv.input_metadata_file_path + '"'
        return ' '.join(arguments)

    def run(self):
        """Run the metadata transform and copy its output over the temp file.

        The command is executed from an argument list with ``shell=False`` so
        that paths containing spaces or shell metacharacters reach java intact
        and are never interpreted by a shell.
        NOTE(review): shell expansion (``~``, ``$VAR``, globs) in the configured
        paths is no longer performed - confirm no configuration relies on it.
        """
        try:
            return_code = subprocess.call(self._transform_arguments(),
                                          stdin=None, shell=False)
        except OSError as error:
            # Under shell=True a missing executable only produced a non-zero
            # shell status; stay equally lenient instead of raising.
            self.debug.print_debug(
                self, u'Unable to launch metadata transform: {0!r}'.format(error))
        else:
            if return_code != 0:
                self.debug.print_debug(
                    self, u'Metadata transform exited with status {0}'.format(return_code))

        # copy back to the temp file for debug purposes
        Manipulate.update_tmp_file(self.gv.nlm_file_path, self.gv.nlm_temp_file_path)

        self.debug.print_debug(self, u'Running metadata transform')

    def _collect_text_fields(self, tree, xpath, key_prefix, count_key, message):
        """Store the stripped text of every element matched by ``xpath``.

        Each value is written to ``metadata_items`` under
        ``key_prefix + index``, appended to ``self.metadata`` and logged with
        ``message``; the number of matches is stored under ``count_key``.
        """
        count = 0
        for element in tree.xpath(xpath, namespaces=self._TEI_NAMESPACES):
            text = self.get_stripped_text(element).strip()
            self.metadata_items['{0}{1}'.format(key_prefix, count)] = text
            self.metadata.append(text)
            count += 1
            self.debug.print_debug(self, message.format(text))
        self.metadata_items[count_key] = count

    def extract_metadata_fields(self):
        """Read IDs, titles, contributor names and affiliations from the metadata file.

        Results are accumulated on the instance: ``metadata_items`` (keyed
        values plus per-kind counts), ``metadata`` (flat list of text values)
        and ``authors`` (one component list per contributor). Nothing is
        returned.
        """
        # load the metadata file for reading
        tree = self.load_dom_read()

        # attempt to find identifiers
        self._collect_text_fields(
            tree, '//article-id | //tei:article-id', 'ID', 'IDs_count',
            u'Extracted an article ID: "{0}" from metadata')

        # attempt to find article titles
        self._collect_text_fields(
            tree, '//article-title | //tei:article-title', 'title', 'titles_count',
            u'Extracted an article title: "{0}" from metadata')

        # attempt to find journal titles
        # NOTE(review): the un-namespaced branch selects journal-id while the
        # TEI branch selects journal-title; kept as found - confirm intent.
        self._collect_text_fields(
            tree, '//journal-id | //tei:journal-title', 'journal_title',
            'journal_titles_count',
            u'Extracted a journal title: "{0}" from metadata')

        # attempt to find authors; each name is kept as a list of its child
        # elements' text rather than as one string
        count = 0
        names = tree.xpath('//contrib/name | //tei:contrib/tei:name',
                           namespaces=self._TEI_NAMESPACES)
        for name in names:
            components = []
            for component in name:
                text = self.get_stripped_text(component).strip()
                self.debug.print_debug(
                    self, u'Extracted a name component: "{0}" from metadata'.format(text))
                components.append(text)

            self.metadata_items['contrib_name{0}'.format(count)] = components
            self.authors.append(components)
            count += 1
        self.metadata_items['contrib_names_count'] = count

        # attempt to find affiliations
        self._collect_text_fields(
            tree, '//aff | //tei:aff', 'aff', 'affs_count',
            u'Extracted an affiliation: "{0}" from metadata')

    @staticmethod
    def _contains_author(text, author):
        """Return True when every non-empty component of ``author`` occurs in ``text``.

        Empty components are ignored and a name with no non-empty component
        never matches, because the empty string is a substring of any text.
        """
        components = [part for part in author if part]
        if not components:
            return False
        for part in components:
            if part not in text:
                return False
        return True

    def pre_clean(self):
        """Remove leading TEI body lines that merely repeat the metadata.

        Walks the descendants of ``tei:body`` in document order, looking only
        at ``head``, ``p`` and ``cit`` elements. An element is removed when its
        text contains every component of a not-yet-matched author, or contains
        any extracted metadata value. Scanning stops once more than two
        inspected elements turned out not to be metadata. The tree is saved
        afterwards.
        """
        self.extract_metadata_fields()

        manipulate = TeiManipulate(self.gv)
        tree = manipulate.load_dom_tree()

        # get all elements in the body
        section = tree.xpath('//tei:body//*', namespaces=self._TEI_NAMESPACES)

        # number of inspected elements that were kept (i.e. not metadata)
        count = 0
        matched_authors = []

        for item in section:
            if count > 2:
                break

            if item.tag not in self._PRE_CLEAN_TAGS:
                continue

            count += 1
            text = self.get_stripped_text(item)
            removed = False

            for author in self.authors:
                if author in matched_authors:
                    continue
                if self._contains_author(text, author):
                    # found a metadata line
                    matched_authors.append(author)
                    count -= 1
                    item.getparent().remove(item)
                    self.debug.print_debug(
                        self,
                        u'Removed line "{0}" '
                        u'because it appears to be author metadata'.format(text))
                    removed = True
                    break

            if removed:
                continue

            for value in self.metadata:
                # Skip empty values: '' is contained in every string and would
                # otherwise cause every inspected element to be removed.
                if value and value in text:
                    # found a metadata line
                    count -= 1
                    item.getparent().remove(item)
                    self.debug.print_debug(
                        self,
                        u'Removed line "{0}" '
                        u'because it appears to be duplicated metadata'.format(text))
                    # Stop at the first hit: the element is already detached,
                    # so a second removal would fail and miscount.
                    break

        manipulate.save_tree(tree)