Source code for bin.metadata
#!/usr/bin/env python
import subprocess
from manipulate import Manipulate
from teimanipulate import TeiManipulate
class Metadata(Manipulate):
    """Attach external article metadata and strip copies of it from the body.

    The class has two jobs, both driven by the paths held on ``gv``:

    * ``run`` launches a Saxon XSLT transform that reads
      ``gv.nlm_temp_file_path``, applies ``gv.metadata_style_sheet_path``
      with ``gv.input_metadata_file_path`` as the ``metadataFile`` parameter
      and writes ``gv.nlm_file_path``.
    * ``pre_clean`` reads the metadata file, then removes elements near the
      start of the TEI body whose text repeats an author name or another
      extracted metadata value.
    """

    # Prefix-to-URI map shared by every XPath query in this class.
    _TEI_NAMESPACES = {'tei': 'http://www.tei-c.org/ns/1.0'}

    # Tags (in lxml's "{namespace}local" form) of the body elements that
    # pre_clean treats as candidate lines.
    _BODY_LINE_TAGS = ('{http://www.tei-c.org/ns/1.0}head',
                       '{http://www.tei-c.org/ns/1.0}p',
                       '{http://www.tei-c.org/ns/1.0}cit')

    def __init__(self, gv):
        """Store ``gv`` and initialise the (initially empty) metadata stores.

        :param gv: project settings object; this class reads its ``debug``
            attribute and its ``*_path`` attributes.
        """
        self.gv = gv
        # Maps keys such as 'ID0', 'title0' or 'IDs_count' to extracted values.
        self.metadata_items = {}
        # Flat list of every extracted text value except author names.
        self.metadata = []
        # One list of name-component strings per extracted contributor.
        self.authors = []
        self.debug = self.gv.debug
        self.dom_to_load = self.gv.input_metadata_file_path
        self.dom_temp_file = self.gv.input_metadata_file_path
        self.mod_name = 'Metadata'
        Manipulate.__init__(self, gv)

    def _saxon_command(self, shell_quoted=False):
        """Return the Saxon invocation as a list of arguments.

        :param shell_quoted: when true, the ``metadataFile`` value is wrapped
            in double quotes, which is the form needed if the arguments are
            joined into a single shell string. When false the value is left
            bare, which is the form needed when the list is executed directly.
        """
        metadata_path = self.gv.input_metadata_file_path
        if shell_quoted:
            parameter = 'metadataFile="' + metadata_path + '"'
        else:
            parameter = 'metadataFile=' + metadata_path

        return ["java", "-classpath", self.gv.java_class_path,
                "-Dxml.catalog.files=" + self.gv.runtime_catalog_path,
                "net.sf.saxon.Transform",
                "-x", "org.apache.xml.resolver.tools.ResolvingXMLReader",
                "-y", "org.apache.xml.resolver.tools.ResolvingXMLReader",
                "-r", "org.apache.xml.resolver.tools.CatalogResolver",
                "-o", self.gv.nlm_file_path,
                self.gv.nlm_temp_file_path,
                self.gv.metadata_style_sheet_path,
                parameter]

    def attach_metadata(self):
        """Return the Saxon transform command as one space-joined string.

        Only the ``metadataFile`` value is quoted; the other paths are
        inserted verbatim, so the string is not safe to hand to a shell when
        a path contains whitespace. ``run`` does not use this string for that
        reason; the method is kept so existing callers get the same value.
        """
        return ' '.join(self._saxon_command(shell_quoted=True))

    def run(self):
        """Run the metadata transform and refresh the temporary NLM file.

        The exit status of the java process is not inspected.

        :raises OSError: if the ``java`` executable cannot be started.
        """
        self.debug.print_debug(self, u'Running metadata transform')

        # Execute the argument list directly instead of through a shell so
        # that paths containing spaces or shell metacharacters reach Saxon
        # intact.
        subprocess.call(self._saxon_command(), stdin=None)

        # copy back to the temp file for debug purposes
        Manipulate.update_tmp_file(self.gv.nlm_file_path, self.gv.nlm_temp_file_path)

    def _collect_text_fields(self, tree, query, key_prefix, count_key, description):
        """Record the stripped text of every node matched by ``query``.

        Each value is stored in ``self.metadata_items`` under
        ``key_prefix`` followed by its zero-based position, appended to
        ``self.metadata``, and reported through the debug channel. The number
        of matches is stored under ``count_key``.

        :param tree: parsed metadata document exposing ``xpath``.
        :param query: XPath expression; may use the ``tei`` prefix.
        :param key_prefix: prefix of the per-item keys, e.g. ``'title'``.
        :param count_key: key that receives the number of matches.
        :param description: noun phrase used in the debug message.
        """
        nodes = tree.xpath(query, namespaces=self._TEI_NAMESPACES)

        for index, node in enumerate(nodes):
            text = self.get_stripped_text(node).strip()
            self.metadata_items['{0}{1}'.format(key_prefix, index)] = text
            self.metadata.append(text)
            self.debug.print_debug(self, u'Extracted {0}: "{1}" from metadata'.format(description, text))

        self.metadata_items[count_key] = len(nodes)

    def _collect_author_names(self, tree):
        """Record the name components of every contributor in ``tree``.

        Each contributor becomes a list holding the stripped text of every
        child element of its name node. The list is stored under
        ``contrib_name<N>`` in ``self.metadata_items`` and appended to
        ``self.authors``; the total goes under ``contrib_names_count``.
        """
        names = tree.xpath('//contrib/name | //tei:contrib/tei:name',
                           namespaces=self._TEI_NAMESPACES)

        for index, name in enumerate(names):
            components = []
            for component in name:
                text = self.get_stripped_text(component).strip()
                self.debug.print_debug(self, u'Extracted a name component: "{0}" from metadata'.format(text))
                components.append(text)

            self.metadata_items['contrib_name{0}'.format(index)] = components
            self.authors.append(components)

        self.metadata_items['contrib_names_count'] = len(names)

    def extract_metadata_fields(self):
        """Read identifiers, titles, authors and affiliations from the metadata file.

        Results are added to ``self.metadata_items``, ``self.metadata`` and
        ``self.authors``. The two lists are appended to, not reset, so calling
        this method more than once on the same instance records the values
        again.
        """
        # load the metadata file for reading
        tree = self.load_dom_read()

        self._collect_text_fields(tree, '//article-id | //tei:article-id',
                                  'ID', 'IDs_count', u'an article ID')

        self._collect_text_fields(tree, '//article-title | //tei:article-title',
                                  'title', 'titles_count', u'an article title')

        # NOTE(review): the un-namespaced half of this query selects
        # journal-id while the TEI half selects journal-title. Kept exactly
        # as found; confirm whether '//journal-title' was intended.
        self._collect_text_fields(tree, '//journal-id | //tei:journal-title',
                                  'journal_title', 'journal_titles_count', u'a journal title')

        self._collect_author_names(tree)

        self._collect_text_fields(tree, '//aff | //tei:aff',
                                  'aff', 'affs_count', u'an affiliation')

    def _find_author_in(self, text, already_matched):
        """Return the first unmatched author whose name parts all occur in ``text``.

        Empty name parts are ignored, and an author with no non-empty part
        never matches; otherwise such an author would match every line,
        because the empty string is contained in any text.

        :param text: text of the body element under inspection.
        :param already_matched: authors that have already caused a removal.
        :returns: the matching entry of ``self.authors``, or ``None``.
        """
        for author in self.authors:
            if author in already_matched:
                continue

            parts = [part for part in author if part]
            if parts and all(part in text for part in parts):
                return author

        return None

    def _contains_metadata(self, text):
        """Return True if any non-empty extracted metadata value occurs in ``text``."""
        return any(value and value in text for value in self.metadata)

    def pre_clean(self):
        """Remove leading TEI body lines that repeat the extracted metadata.

        Walks the head, p and cit elements under the TEI body in document
        order. An element is removed when its text contains every name part
        of a not-yet-matched author, or contains any other extracted metadata
        value. The walk stops once three candidate elements have been kept.
        The modified tree is then saved through ``TeiManipulate``.
        """
        self.extract_metadata_fields()

        manipulate = TeiManipulate(self.gv)
        tree = manipulate.load_dom_tree()

        # get all elements in the body
        candidates = tree.xpath('//tei:body//*', namespaces=self._TEI_NAMESPACES)

        kept = 0
        matched_authors = []

        for item in candidates:
            if kept > 2:
                break

            if item.tag not in self._BODY_LINE_TAGS:
                continue

            text = self.get_stripped_text(item)

            author = self._find_author_in(text, matched_authors)
            if author is not None:
                matched_authors.append(author)
                reason = u'author metadata'
            elif self._contains_metadata(text):
                reason = u'duplicated metadata'
            else:
                # An ordinary line: it counts towards the early-exit limit.
                kept += 1
                continue

            # Remove the element exactly once, however many values matched;
            # a second removal would fail because it no longer has a parent.
            item.getparent().remove(item)
            self.debug.print_debug(self, u'Removed line "{0}" '
                                         u'because it appears to be {1}'.format(text, reason))

        manipulate.save_tree(tree)