Source code for bin.teitonlm

#!/usr/bin/env python
#@Author Dulip Withanage
import subprocess
import shutil
from lxml import etree
from nlmmanipulate import NlmManipulate
from teimanipulate import TeiManipulate
from debug import Debuggable


class TeiToNlm(Debuggable):
    """Transform a TEI file into NLM XML with Saxon and tidy up the result.

    The work is split into three stages, driven by :meth:`run`:

    1. :meth:`pre_cleanup` rewrites the TEI tree so the stylesheet can
       process it.
    2. :meth:`run_transform` invokes Saxon (``net.sf.saxon.Transform``) in a
       Java subprocess and copies the output to its final location.
    3. :meth:`run_quirks` post-processes the produced NLM document.

    All paths and settings are read from the ``gv`` object passed to the
    constructor.
    """

    # Namespace URI and prefix map used for every XPath query on the TEI tree.
    _TEI_URI = 'http://www.tei-c.org/ns/1.0'
    _TEI_NS = {'tei': _TEI_URI}

    # Ancestors (in Clark notation) that a heading-bearing div may live in.
    _ALLOWED_HEAD_ANCESTORS = ('{' + _TEI_URI + '}div', '{' + _TEI_URI + '}body')

    # Selects <hi> children whose @rend marks them as separately styled text.
    _STYLED_HI = ('tei:hi[contains(@rend, "Indent") or contains(@rend, "Default Style") or '
                  'contains(@rend, "Text Body")]')

    # Selects the comment nodes that stand in for line breaks.
    _BR_COMMENT = 'comment()[. = "meTypeset:br"]'

    def __init__(self, gv):
        """Store the shared ``gv`` object and its debug handle.

        :param gv: object providing paths, settings and ``debug``.
        """
        self.gv = gv
        self.module_name = "TEI to NLM"
        self.debug = gv.debug
        # NOTE(review): super(Debuggable, self) starts the MRO lookup *after*
        # Debuggable, so Debuggable.__init__ itself is never called here.
        # Kept as-is because Debuggable's initializer signature is not
        # visible from this file -- confirm whether this is intentional.
        super(Debuggable, self).__init__()

    def _saxon_tei2nlm_args(self):
        """Return the Saxon invocation as an argv list (one item per argument)."""
        resolving_reader = "org.apache.xml.resolver.tools.ResolvingXMLReader"
        return [
            "java",
            "-classpath", self.gv.java_class_path,
            "-Dxml.catalog.files=" + self.gv.runtime_catalog_path,
            "net.sf.saxon.Transform",
            "-x", resolving_reader,
            "-y", resolving_reader,
            "-r", "org.apache.xml.resolver.tools.CatalogResolver",
            "-o", self.gv.nlm_temp_file_path,
            self.gv.tei_file_path,
            self.gv.nlm_style_sheet_dir,
            'autoBlockQuote=true',
        ]

    def saxon_tei2nlm(self):
        """Return the Saxon command line as a single space-joined string.

        The string is not shell-quoted; :meth:`run_transform` executes the
        argv list from :meth:`_saxon_tei2nlm_args` instead so that paths
        containing spaces survive intact.
        """
        return ' '.join(self._saxon_tei2nlm_args())

    def run_quirks(self, process_ref_lists):
        """Post-process the NLM document through ``NlmManipulate``.

        :param process_ref_lists: when truthy, also search for reference
            lists and tag bibliography references.
        """
        manipulate = NlmManipulate(self.gv)
        br_comment = self._BR_COMMENT

        if self.gv.settings.get_setting('linebreaks-as-comments', self) == 'False':
            # we need to convert every instance of <!--meTypeset:br--> to a new paragraph
            manipulate.close_and_open_tag(br_comment, 'p')
            manipulate.close_and_open_tag_not_styled(br_comment, 'title')

        # we will replace inside table cells and titles regardless because these are real JATS break tags
        manipulate.insert_break(br_comment, 'td')
        manipulate.insert_break(br_comment, 'title')

        manipulate.remove_empty_elements('//sec//p')

        if process_ref_lists:
            self.debug.print_debug(self, u'Finding potential reference lists')
            manipulate.find_reference_list()
            manipulate.tag_bibliography_refs()

        for xpath in ('//sec/list', '//sec/disp-quote', '//back/ref-list/ref'):
            manipulate.remove_empty_elements(xpath)

    def _unwrap_heading_ancestors(self, manipulate, tree):
        """Strip wrappers that sit between a heading-bearing div and its div/body."""
        allowed = self._ALLOWED_HEAD_ANCESTORS
        renamed = 0

        for element in tree.xpath('//tei:div[tei:head]', namespaces=self._TEI_NS):
            # Walk upwards, marking every ancestor for removal until the
            # first div/body (or the top of the tree) is reached.
            ancestor = element.getparent()
            while ancestor is not None:
                tag = ancestor.tag
                if not tag or tag in allowed:
                    break
                ancestor.tag = 'REMOVE'
                renamed += 1
                ancestor = ancestor.getparent()

        if renamed > 0:
            # strip_tags drops the marked elements but keeps their content.
            etree.strip_tags(tree, 'REMOVE')
            manipulate.save_tree(tree)
            self.debug.print_debug(self, u'Extracted {0} headings from inside invalid elements'.format(renamed))

    def _split_styled_paragraphs(self, manipulate, tree):
        """Move each styled <hi> child of a <p> into its own following <p>."""
        styled_paragraphs = tree.xpath('//tei:p[' + self._STYLED_HI + ']', namespaces=self._TEI_NS)

        for parent in styled_paragraphs:
            add_position = parent

            for element in parent.xpath(self._STYLED_HI, namespaces=self._TEI_NS):
                # NOTE(review): the new element is created without the TEI
                # namespace, exactly as before -- confirm the serialised
                # file still yields tei:p for the stylesheet.
                new_p = etree.Element('p')

                if 'rend' in parent.attrib:
                    new_p.attrib['rend'] = parent.attrib['rend']

                # Chain the new paragraphs after one another so that the
                # original order of the <hi> elements is preserved.
                add_position.addnext(new_p)
                new_p.append(element)
                add_position = new_p

            # Reported per paragraph, inside the loop, so `parent` is always
            # bound when it is referenced.
            manipulate.save_tree(tree)
            self.debug.print_debug(self, u'Separated out p {0}'.format(manipulate.get_stripped_text(parent)))

    def pre_cleanup(self):
        """Rewrite the TEI tree so the XSL transform handles it correctly.

        Two passes are made over the tree loaded through ``TeiManipulate``:

        * head elements must not be encapsulated within elements that stop
          them from being transformed, so such wrappers are stripped;
        * ``p`` elements holding ``hi`` children whose ``rend`` contains
          "Indent", "Default Style" or "Text Body" are split into new
          paragraphs.
        """
        manipulate = TeiManipulate(self.gv)
        tree = manipulate.load_dom_tree()

        self._unwrap_heading_ancestors(manipulate, tree)
        self._split_styled_paragraphs(manipulate, tree)

    def run_transform(self):
        """Run the pre-cleanup, then the Saxon TEI->NLM transform.

        The command is executed from an argv list without a shell, so paths
        containing spaces or shell metacharacters are passed to Java
        verbatim. A non-zero exit status is reported through the debug
        channel. The temporary output is then copied to the final NLM path
        when the two differ.
        """
        self.pre_cleanup()

        self.gv.mk_dir(self.gv.nlm_folder_path)
        java_command = self._saxon_tei2nlm_args()
        self.debug.print_debug(self, u'Running saxon transform (TEI->NLM)')

        return_code = subprocess.call(java_command, stdin=None)
        if return_code != 0:
            self.debug.print_debug(self, u'Saxon transform exited with status {0}'.format(return_code))

        if self.gv.nlm_temp_file_path != self.gv.nlm_file_path:
            shutil.copy2(self.gv.nlm_temp_file_path, self.gv.nlm_file_path)

    def run(self, process_ref_lists, transform=True):
        """Run the conversion pipeline.

        :param process_ref_lists: forwarded to :meth:`run_quirks`.
        :param transform: when truthy, run :meth:`run_transform` first.

        The quirks pass is skipped when the ``--purenlm`` argument is set.
        """
        if transform:
            self.run_transform()

        args = self.gv.settings.args
        if '--purenlm' in args and args['--purenlm']:
            return

        self.run_quirks(process_ref_lists)