Source code for bin.teitonlm
#!/usr/bin/env python
#@Author Dulip Withanage
import subprocess
import shutil
from lxml import etree
from nlmmanipulate import NlmManipulate
from teimanipulate import TeiManipulate
from debug import Debuggable
[docs]class TeiToNlm (Debuggable):
def __init__(self, gv):
self.gv = gv
self.module_name = "TEI to NLM"
self.debug = gv.debug
super(Debuggable, self).__init__()
[docs] def saxon_tei2nlm(self):
cmd = ["java", "-classpath", self.gv.java_class_path,
"-Dxml.catalog.files=" + self.gv.runtime_catalog_path,
"net.sf.saxon.Transform",
"-x", "org.apache.xml.resolver.tools.ResolvingXMLReader",
"-y", "org.apache.xml.resolver.tools.ResolvingXMLReader",
"-r", "org.apache.xml.resolver.tools.CatalogResolver",
"-o", self.gv.nlm_temp_file_path,
self.gv.tei_file_path,
self.gv.nlm_style_sheet_dir,
'autoBlockQuote=true'
]
return ' '.join(cmd)
[docs] def run_quirks(self, process_ref_lists):
manipulate = NlmManipulate(self.gv)
if self.gv.settings.get_setting('linebreaks-as-comments', self) == 'False':
# we need to convert every instance of <!--meTypeset:br--> to a new paragraph
manipulate.close_and_open_tag('comment()[. = "meTypeset:br"]', 'p')
manipulate.close_and_open_tag_not_styled('comment()[. = "meTypeset:br"]', 'title')
# we will replace inside table cells and titles regardless because these are real JATS break tags
manipulate.insert_break('comment()[. = "meTypeset:br"]', 'td')
manipulate.insert_break('comment()[. = "meTypeset:br"]', 'title')
manipulate.remove_empty_elements('//sec//p')
if process_ref_lists:
self.debug.print_debug(self, u'Finding potential reference lists')
manipulate.find_reference_list()
manipulate.tag_bibliography_refs()
manipulate.remove_empty_elements('//sec/list')
manipulate.remove_empty_elements('//sec/disp-quote')
manipulate.remove_empty_elements('//back/ref-list/ref')
[docs] def pre_cleanup(self):
manipulate = TeiManipulate(self.gv)
tree = manipulate.load_dom_tree()
# make sure that head elements are not encapsulated within any elements that will stop them from being
# correctly transformed by the XSL
allowed = ['{http://www.tei-c.org/ns/1.0}div', '{http://www.tei-c.org/ns/1.0}body']
head_elements = tree.xpath('//tei:div[tei:head]', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
count = 0
for element in head_elements:
current = element
while current is not None:
current = current.getparent()
if current is not None:
if current.tag and current.tag not in allowed:
current.tag = 'REMOVE'
count += 1
elif current.tag and current.tag in allowed:
break
else:
break
if count > 0:
etree.strip_tags(tree, 'REMOVE')
manipulate.save_tree(tree)
self.debug.print_debug(self, u'Extracted {0} headings from inside invalid elements'.format(count))
# split any p tags with sub-tags hi rend="Indent" into new elements
biblio_elements = tree.xpath('//tei:p'
'[tei:hi[contains(@rend, "Indent") or contains(@rend, "Default Style") or '
'contains(@rend, "Text Body")]]',
namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
for parent in biblio_elements:
add_position = parent
for element in parent.xpath('tei:hi[contains(@rend, "Indent") or contains(@rend, "Default Style") or '
'contains(@rend, "Text Body")]',
namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}):
new_p = etree.Element('p')
if 'rend' in parent.attrib:
new_p.attrib['rend'] = parent.attrib['rend']
add_position.addnext(new_p)
new_p.append(element)
add_position = new_p
manipulate.save_tree(tree)
self.debug.print_debug(self, u'Separated out p {0}'.format(manipulate.get_stripped_text(parent)))
[docs] def run(self, process_ref_lists, transform=True):
if transform:
self.run_transform()
if '--purenlm' in self.gv.settings.args and self.gv.settings.args['--purenlm']:
return
self.run_quirks(process_ref_lists)