[docs]__author__ = "Martin Paul Eve"
[docs]__email__ = "martin@martineve.com"
from manipulate import Manipulate
from lxml import etree
import re
import uuid
[docs]class NlmManipulate(Manipulate):
def __init__(self, gv):
    """
    Set up the manipulator for the NLM document described by the shared settings object.

    @param gv: settings object supplying the debug handle and the NLM file paths copied onto this instance
    """
    self.gv = gv
    self.debug = gv.debug

    # the document that is loaded and the temporary copy; save_tree writes to both
    self.dom_to_load = gv.nlm_file_path
    self.dom_temp_file = gv.nlm_temp_file_path

    self.namespaces = {
        'jats': 'http://dtd.nlm.nih.gov/publishing/3.0/journalpublishing3.dtd',
        'xmlns:xlink': 'http://www.w3.org/1999/xlink',
    }
    self.mod_name = 'NLM'

    # the base initialiser runs last, after the attributes above exist
    Manipulate.__init__(self, gv)
def remove_reference_numbering(self):
    """
    Strip a leading run of digits (with an optional full stop and surrounding whitespace) from the text of
    every ref element, then write the tree back out.
    """
    leading_number = re.compile(r'^\d\d*\s*\.?\s*')
    tree = self.load_dom_tree()

    for ref in tree.xpath('//ref'):
        current = getattr(ref, 'text', None)
        if current is None:
            # nothing to clean on refs without direct text
            continue
        ref.text = leading_number.sub(r'', current)

    self.save_tree(tree)
def remove_empty_elements(self, element):
    """
    Remove every element matched by an XPath expression that has no content, preserving trailing text.

    An element counts as empty when its own stripped text is blank and none of its direct children has
    stripped text or is a graphic. Text following a removed element (its tail) is appended to the tail of the
    preceding sibling, or to the parent's text when there is no preceding sibling, so that neither the moved
    text nor the text already in that position is lost.

    The tree is written to disk once, after all candidates have been processed.

    @param element: the XPath expression selecting candidate elements (for example '//p')
    """
    tree = self.load_dom_tree()

    for candidate in tree.xpath(element):
        if self.get_stripped_text(candidate).strip() != '':
            continue

        # a child with text, or a graphic, means the element still carries content
        if any(self.get_stripped_text(child) != '' or child.tag == 'graphic' for child in candidate):
            continue

        parent = candidate.getparent()
        if parent is None:
            # a matched root element cannot be detached from anything
            continue

        tail = candidate.tail

        if not tail:
            parent.remove(candidate)
            self.debug.print_debug(self, u'Removed an empty element')
            continue

        previous = candidate.getprevious()

        if previous is None:
            parent.text = (parent.text or '') + tail
        else:
            # append instead of overwriting: the sibling may already have text between it and the candidate
            previous.tail = (previous.tail or '') + tail

        parent.remove(candidate)
        self.debug.print_debug(self, u'Removed an empty element but preserved tail')

    self.save_tree(tree)
def double_p_compliance(self):
    """
    Hoist paragraphs that are nested directly inside another paragraph so that they follow it as siblings.

    Hoisted paragraphs keep their document order: each one is placed after the paragraph most recently
    hoisted out of the same outermost paragraph, rather than always directly after that outermost paragraph
    (which would reverse the order of siblings). The tree is loaded through this instance and saved once at
    the end.
    """
    self.debug.print_debug(self, u'Attempting to correct any mis-nested paragraph elements')

    tree = self.load_dom_tree()

    # hoisted paragraph -> the outermost paragraph it originally lived under
    hoisted_from = {}
    # outermost paragraph -> the element most recently placed after it
    last_placed = {}

    for p in tree.xpath('//p/p'):
        p_parent = p.getparent()

        # if the parent was itself hoisted earlier, keep following the original outer paragraph
        outer = hoisted_from.get(p_parent, p_parent)
        anchor = last_placed.get(outer, outer)

        container = anchor.getparent()
        if container is None:
            # the outer paragraph is the document root; there is nowhere to hoist to
            continue

        container.insert(container.index(anchor) + 1, p)
        hoisted_from[p] = outer
        last_placed[outer] = p

    self.save_tree(tree)
@staticmethod
[docs] def handle_nested_elements(iter_node, move_node, node, node_parent, outer_node, tag_name, tail_stack,
tail_stack_objects):
"""
Rebuild the chain of ancestor elements between a marker node and its enclosing tag_name element around the
marker's tail text, then remove the marker.

Starting at iter_node, parents are followed until an element whose tag equals tag_name is reached; the tag
and the element of every step on the way are pushed onto tail_stack and tail_stack_objects. Both lists belong
to the caller and are modified in place (appended to, then reversed). A new element is created for each
recorded tag; the first is added after the first entry of the reversed tail_stack_objects, each later one is
appended to the one before it, and the last one receives node.tail as its text.

@param iter_node: the element from which the upward walk starts; None yields (None, None, None)
@param move_node: overwritten before use; the value passed in is ignored
@param node: the marker node whose tail is re-wrapped; it is removed from its parent
@param node_parent: overwritten before use; the value passed in is ignored
@param outer_node: returned unchanged as the second result when no tags were recorded
@param tag_name: the tag at which the upward walk stops
@param tail_stack: list that receives the tags passed on the way up (mutated)
@param tail_stack_objects: list that receives the elements passed on the way up (mutated)
@return: (True, first rebuilt element or outer_node, the tag_name ancestor), or (None, None, None) when the
walk runs out of parents before finding tag_name
"""
if iter_node is None:
return None, None, None
# walk upwards, recording every element passed until the enclosing tag_name element is found
while iter_node.tag != tag_name:
tail_stack.append(iter_node.tag)
tail_stack_objects.append(iter_node)
iter_node = iter_node.getparent()
if iter_node is None:
return None, None, None
# get the tail (of the comment) and style it
append_location = None
# NOTE(review): tail_text is assigned but not read anywhere in this block
tail_text = node.tail
iterator = 0
# the stacks were filled innermost-first; reverse them so that index 0 is the outermost recorded element
tail_stack.reverse()
tail_stack_objects.reverse()
# rebuild the styled tree on a set of subelements
for node_to_add in tail_stack:
sub_element = etree.Element(node_to_add)
# only the last (innermost) rebuilt element carries the text
if iterator == len(tail_stack) - 1:
sub_element.text = node.tail
if iterator == 0:
outer_node = sub_element
iterator += 1
if append_location is None:
tail_stack_objects[0].addnext(sub_element)
append_location = sub_element
else:
Manipulate.append_safe(append_location, sub_element, None)
append_location = sub_element
# remove the old node (this is in the element above)
node.getparent().remove(node)
# set the search node to the outermost node so that we can find siblings
node_parent = iter_node
node = outer_node
move_node = True
return move_node, node, node_parent
@staticmethod
[docs] def search_and_copy(last_node, move_node, nested_sibling, new_element, node, node_parent):
"""
Move everything that follows a marker node (its following-sibling nodes) into new_element and insert
new_element into the tree.

Text results from the XPath query are attached as text or tail; element results are appended to new_element
with Manipulate.append_safe. new_element is then added after node_parent (whose tail is blanked) when
nested_sibling is None, otherwise after nested_sibling.

@param last_node: overwritten before use; the value passed in is ignored
@param move_node: when true, the marker node itself is moved into new_element carrying the first text result
as its tail; when false, that text becomes new_element's text and the marker's tail is blanked
@param nested_sibling: the element after which new_element is inserted, or None to insert after node_parent;
it is rebound locally and that new value is not returned
@param new_element: the element that receives the copied content
@param node: the marker node whose following siblings are collected
@param node_parent: the element after which new_element is inserted when nested_sibling is None
@return: new_element when move_node is false, otherwise None
"""
append_location = new_element
new_nodes_to_copy = node.xpath('following-sibling::node()')
last_append = None
for new_node in new_nodes_to_copy:
# text nodes come back from lxml as "smart string" results rather than elements
if type(new_node) is etree._ElementStringResult or type(new_node) is etree._ElementUnicodeResult:
if last_append is None:
last_append = append_location
if move_node:
node.tail = new_node
Manipulate.append_safe(last_append, node, None)
else:
last_append.text = new_node
else:
last_append.tail = new_node
else:
Manipulate.append_safe(append_location, new_node, None)
last_append = new_node
if nested_sibling is None:
node_parent.addnext(new_element)
node_parent.tail = ''
nested_sibling = new_element
else:
nested_sibling.addnext(new_element)
nested_sibling = new_element
# remove the original tail (all text after the line break, for example)
# <!--meTypeset:br-->A third break <-- "a third break" is the tail
if not move_node:
node.tail = ''
last_node = new_element
else:
last_node = None
return last_node
@staticmethod
def add_error_tag(node, error_number):
    """
    Record an error marker of the form 'error-<number>' in the node's space-separated 'rend' attribute.

    The marker is compared against the whole tokens already present, so 'error-1' is still added when the
    attribute only contains 'error-10'. A marker that is already present is not duplicated.

    @param node: the element whose 'rend' attribute is updated
    @param error_number: the error identifier appended after 'error-'
    """
    marker = 'error-{0}'.format(error_number)
    current = node.attrib.get(u'rend')

    if not current:
        # no attribute yet (or an empty one): the marker becomes the whole value
        node.attrib[u'rend'] = marker
    elif marker not in current.split():
        # append the new value
        node.attrib[u'rend'] = u'{0} {1}'.format(current, marker)
[docs] def close_and_open_tag_not_styled(self, search_xpath, tag_name):
"""
Opens and closes an XML tag within a document. This is primarily useful when we have a marker such as
meTypeset:br in comments which corresponds to no JATS/NLM equivalent. We use this function in certain
behavioural modes to close the preceding paragraph and open the next.
This variant only performs this action when the subsequent text does not look like a heading.

Marker nodes are looked up beneath tag_name elements. When more than 80 are found and the '--aggression'
setting is below 11, an error is logged and written with code '001' instead of processing the markers. Each
marker that passes the sibling check below is handed to process_node_for_tags with 'p' as the extra argument.
The tree is saved afterwards.
@param search_xpath: the node that serves as a marker
@param tag_name: the tag name that will be open and closed
"""
tree = self.load_dom_tree()
initial_nodes = tree.xpath('//{0}//{1}'.format(tag_name, search_xpath))
self.debug.print_debug(self, u'Found {0} {1} nodes on which to close and open tag {2}'.format(
len(initial_nodes), search_xpath, tag_name))
nested_sibling = None
bail = False
# safety limit: too many markers usually means the markers are not genuine paragraph breaks
if len(initial_nodes) > 80 and int(self.gv.settings.args["--aggression"]) < 11:
self.debug.print_debug(self, u'Bailing from replacement of tag {0} [limit exceeded]'.format(search_xpath))
self.debug.write_error(self,
'Bailing from replacement of tag {0} [limit exceeded]'.format(search_xpath),
'001')
bail = True
if not bail:
for node in initial_nodes:
sibling = node
# NOTE(review): sibling is never advanced inside this loop, so the loop presumably runs at most once
# per marker (via the break below) - confirm against the original indentation
while sibling.getnext() is not None:
try:
if sibling.tag.endswith('bold'):
bail = True
# NOTE(review): bare except; presumably meant for nodes whose tag is not a string (comments) - confirm
except:
bail = True
break
# NOTE(review): bail is not reset between markers, so once set no later marker is processed
if not bail:
self.process_node_for_tags(nested_sibling, node, search_xpath, tag_name, 'p')
else:
# add an error tag to p elements where there are more than 3 comments within
# NOTE(review): the format() call has no placeholder to fill, so the query is used as written
children = tree.xpath('//*[count(comment()[.="meTypeset:br"]) > 3]'.format(search_xpath))
for child in children:
self.add_error_tag(child, u'001')
self.save_tree(tree)
def close_and_open_tag(self, search_xpath, tag_name):
    """
    Close and re-open tag_name elements at every marker node found beneath them.

    Markers such as meTypeset:br comments have no JATS/NLM equivalent; each one is handed to
    process_node_for_tags. When more than 80 markers are found and the '--aggression' setting is below 11,
    nothing is processed: an error with code '001' is logged and written, and every element holding more than
    three meTypeset:br comments is given an error tag instead. The tree is saved in both cases.

    @param search_xpath: the node that serves as a marker
    @param tag_name: the tag name that will be open and closed
    """
    tree = self.load_dom_tree()
    markers = tree.xpath('//{0}//{1}'.format(tag_name, search_xpath))

    self.debug.print_debug(self, u'Found {0} {1} nodes on which to close and open tag {2}'.format(
        len(markers), search_xpath, tag_name))

    # the aggression setting is only consulted once the marker count is over the limit
    limit_exceeded = len(markers) > 80 and int(self.gv.settings.args["--aggression"]) < 11

    if limit_exceeded:
        message = 'Bailing from replacement of tag {0} [limit exceeded]'.format(search_xpath)
        self.debug.print_debug(self, message)
        self.debug.write_error(self, message, '001')

        # add an error tag to p elements where there are more than 3 comments within
        for crowded in tree.xpath('//*[count(comment()[.="meTypeset:br"]) > 3]'):
            self.add_error_tag(crowded, u'001')
    else:
        for marker in markers:
            self.process_node_for_tags(None, marker, search_xpath, tag_name)

    self.save_tree(tree)
def save_tree(self, tree):
    """
    Write the tree, pretty-printed, to the temporary file and then to the main document file.

    @param tree: the document tree to serialise
    """
    for destination in (self.dom_temp_file, self.dom_to_load):
        tree.write(destination, pretty_print=True)
def find_text(self, paragraph, text):
    """
    Locate the first element, searching depth-first from paragraph, whose text or tail contains a string.

    For each element the text is checked before the tail, and both are checked before its children.

    @param paragraph: the element at which the search starts
    @param text: the string to look for
    @return: (element, True) when the string is in the element's tail, (element, False) when it is in its
    text, or (None, False) when it is not found
    """
    if paragraph.text:
        if text in paragraph.text:
            return paragraph, False

    if paragraph.tail:
        if text in paragraph.tail:
            return paragraph, True

    for child in paragraph:
        hit, in_tail = self.find_text(child, text)
        if hit is None:
            continue
        return hit, in_tail

    return None, False
def insert_break(self, search_xpath, tag_name):
    """
    Replace every marker node found beneath a tag_name element with an empty break element.

    A new break element is added directly after each marker and the marker is then removed from its parent.
    The tree is saved once all markers have been handled.

    @param search_xpath: the node that serves as a marker
    @param tag_name: the tag name beneath which markers are searched for
    """
    tree = self.load_dom_tree()
    markers = tree.xpath('//{0}//{1}'.format(tag_name,search_xpath))

    self.debug.print_debug(self, u'Found {0} {1} nodes on which to insert break: {2}'.format(
        len(markers), search_xpath, tag_name))

    for marker in markers:
        marker.addnext(etree.Element('break'))
        marker.getparent().remove(marker)

    self.save_tree(tree)
def reflist_indent_method(self, tree):
    """
    Mark the last titled section that contains a disp-quote or a list as a reference list.

    @param tree: the document tree to search; the matched section gets the attribute reflist="yes"
    """
    # tag the last item as a reference list
    for section in tree.xpath('(//sec[title][disp-quote] | //sec[title][list])[last()]'):
        section.attrib['reflist'] = 'yes'
[docs] def reflist_year_match_method(self, tree, root, tolerance):
"""
Look for a block whose entries each contain a year (or 'n.d.', or the placeholder 'XXXX') and mark it as a
reference list.

The elements matched by root are examined from last to first. Within a block only one entry tag is used
(the first of p, disp-quote or list-item encountered); entries that look like references get rend="ref".
A block is accepted when more than one entry was counted and no disqualifying entry was seen; its enclosing
sec element then gets reflist="yes". Rejected blocks have the rend attribute removed from their children.

@param tree: the document tree to search
@param root: the XPath expression selecting candidate blocks (for example '//sec')
@param tolerance: how many entries without a year, or elements of a different tag, are allowed per block
@return: True as soon as a block is accepted, False when none is
"""
sections = tree.xpath(root)
# work upwards as the last section is most likely to contain references
for element in reversed(sections):
found_other = False
count = 0
use_tag = None
diff_count = 0
for p in element:
# use either p or disp-quote, but not a mix
if use_tag is None:
if p.tag == 'p' or p.tag == 'disp-quote' or p.tag == 'list-item':
use_tag = p.tag
if p.tag == use_tag:
# when the entry wraps a paragraph, work on that inner paragraph instead
for sub_element in p:
if sub_element.tag == 'p':
p = sub_element
break
text = self.get_stripped_text(p)
# a four-digit number starting with 1 or 2 (optionally followed by a letter), or 'n.d.'
year_test = re.compile('((1|2)\d{3}[a-z]?)|(n\.d\.)')
match = year_test.findall(text)
if not match:
# no year: a single 'XXXX' placeholder is accepted in its place
blank_text = re.compile('XXXX')
match_inner = blank_text.findall(text)
if not match_inner:
diff_count += 1
if diff_count > tolerance:
self.debug.print_debug(self, u'Too many different non-year matches found in this'
u' {1} section to classify as a reference block. '
u'(Allowed: {0})'.format(tolerance, root))
found_other = True
break
elif len(match_inner) == 1:
count += 1
p.attrib['rend'] = 'ref'
else:
# several matches are only tolerated when the text contains a range such as 1990-1995
page_test = re.compile('(((18|19|20)\d{2})\-((18|19|20)\d{2}))')
is_page_range = page_test.search(text)
if not is_page_range:
self.debug.print_debug(self, u'More than one year match found in this {0}'.format(root))
found_other = True
break
elif len(match) == 1:
# only do this if we find 1 match on the line; otherwise, it's a problem
count += 1
p.attrib['rend'] = 'ref'
else:
# several matches are only tolerated when the text contains a range such as 1990-1995
page_test = re.compile('(((18|19|20)\d{2})\-((18|19|20)\d{2}))')
is_page_range = page_test.search(text)
if not is_page_range:
self.debug.print_debug(self, u'More than one year match found in this {0}'.format(root))
found_other = True
break
elif p.tag != 'title' and not use_tag is None:
# found a tag other than the one we want or 'title'
diff_count += 1
if diff_count > tolerance:
self.debug.print_debug(self, u'Too many different elements found in this {1} section to '
u'classify as a reference block. (Allowed: {0})'.format(tolerance,
root))
found_other = True
break
if count > 1 and not found_other:
self.debug.print_debug(self, u'Found a reference list in a {0} block with '
u'tolerance {1}'.format(root, tolerance))
# climb to the enclosing sec so the attribute lands on the section itself
while element.tag != 'sec':
element = element.getparent()
element.attrib['reflist'] = 'yes'
return True
else:
# not a reference block: undo any rend markers set on its direct children
# NOTE(review): markers set on an inner p of a list-item are not direct children here - confirm they
# are cleaned up elsewhere
for p in element:
if 'rend' in p.attrib:
del p.attrib['rend']
return False
def find_or_create_element(self, tree, element_tag, add_xpath, is_sibling):
    """
    Return the first existing element with the given tag, creating one when the document has none.

    Example: find_or_create_element(tree, 'back', '//body', True)

    The lookup is best-effort: a query that fails with an ordinary exception is treated as "not found", but
    interpreter-level exits such as KeyboardInterrupt are no longer swallowed.

    @param tree: the document tree to search and modify
    @param element_tag: the tag of the element to find or create
    @param add_xpath: XPath expression locating the anchor element used when a new element has to be created
    @param is_sibling: when true the new element is added after the anchor, otherwise it is appended to it
    @return: the existing or newly created element
    """
    try:
        matches = tree.xpath(u'//' + element_tag, namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
    except Exception:
        matches = []

    if matches:
        self.debug.print_debug(self, u'Found existing {0}. Using it.'.format(element_tag))
        return matches[0]

    self.debug.print_debug(self, u'Unable to find an existing {0} element.'.format(element_tag))
    self.debug.print_debug(self, u'Creating new {0} element.'.format(element_tag))

    anchor = tree.xpath(add_xpath)[0]
    new_element = etree.Element(element_tag)

    if is_sibling:
        anchor.addnext(new_element)
    else:
        Manipulate.append_safe(anchor, new_element, self)

    return new_element
def delete_special_lines(self):
    """
    Remove every paragraph whose stripped text consists solely of the characters - . , + # ' ; :

    The tree is written to disk once, and only when at least one paragraph was removed.
    """
    # raw string: the same character class as before, without invalid escape sequences
    special_regex = re.compile(r"^[\-.,+#';:]+$")

    tree = self.load_dom_tree()
    removed_any = False

    for paragraph in tree.xpath('//p'):
        text = self.get_stripped_text(paragraph)

        if not special_regex.match(text):
            continue

        paragraph.getparent().remove(paragraph)
        removed_any = True
        self.debug.print_debug(self, u'Removing special character line: {0}'.format(text))

    if removed_any:
        self.save_tree(tree)
def clean_refs(self):
    """
    Tidy the reference list: strip leading numbering from reference text and wrap loose references.

    First, a leading number (optionally followed by '.' or ',') and the whitespace after it are removed from
    the text of every ref in the back-matter reference list. Then every ref that holds neither an
    element-citation nor a mixed-citation is turned into a mixed-citation nested inside a fresh ref element,
    which takes over the original id. Refs that already contain a mixed-citation are skipped so that running
    this method again does not nest them a second time. The tree is saved once at the end.
    """
    tree = self.load_dom_tree()

    # raw strings: the same expressions as before, without invalid escape sequences
    ref_regex = re.compile(r'^(?P<prelim>\s*\d+[\.\,]?\s+)(?P<reference>.+)')

    for ref in tree.xpath('//back/ref-list/ref'):
        if ref.text and ref_regex.match(ref.text):
            ref.text = ref_regex.sub(r'\g<reference>', ref.text)
            self.debug.print_debug(self,
                                   u'Removing number/whitespace from start of reference: {0}'.format(ref.text))

    for ref in tree.xpath('//back/ref-list/ref[not(element-citation) and not(mixed-citation)]'):
        new_ref = etree.Element('ref')
        ref.addnext(new_ref)

        # the old ref becomes the citation and moves inside the new wrapper
        ref.tag = 'mixed-citation'
        new_ref.append(ref)

        if 'id' in ref.attrib:
            new_ref.attrib['id'] = ref.attrib['id']
            del ref.attrib['id']

    self.save_tree(tree)
    self.debug.print_debug(self, u'Encapsulated any loose refs inside mixed-citation blocks')
def final_clean(self):
    """
    Run the closing clean-up passes in order: special-character lines, stranded reference titles, reference
    tidying, and finally removal of empty fn-group, p and ref-list elements.
    """
    self.delete_special_lines()
    self.handle_stranded_reference_titles_from_cues()
    self.clean_refs()

    for leftover in ('//fn-group', '//p', '//ref-list'):
        self.remove_empty_elements(leftover)
def find_reference_list(self):
    """
    Search sections, then lists inside sections, for a block that looks like a reference list.

    Nothing is done when the list method or the square-bracket reference method has already been used. For
    each candidate XPath the year-matching test is tried with tolerances 0 to 3, stopping at the first
    tolerance that succeeds. The tree is saved afterwards.
    """
    already_handled = self.gv.used_list_method or self.gv.used_square_reference_method
    if already_handled:
        return

    tree = self.load_dom_tree()

    # look for sections where every paragraph contains a year; likely to be a reference
    for tag in ('//sec', '//sec/list'):
        for tolerance in range(4):
            if self.reflist_year_match_method(tree, tag, tolerance):
                break

    self.save_tree(tree)
def handle_stranded_reference_titles_from_cues(self):
    """
    Remove sections that consist of nothing but a title whose text is one of the reference-heading cues.

    The cue terms are read from language/ref_marker_<language>.txt beneath the script directory, one file per
    language named in the 'reference-languages' setting. Terms are lower-cased and stripped of surrounding
    whitespace, and blank lines are ignored, so that trailing spaces in a cue file do not prevent a match and
    an empty title is never mistaken for a cue. The tree is saved once when at least one section was removed.
    """
    self.debug.print_debug(self, u'Checking for any stranded titles as a result of reference parsing')

    tree = self.load_dom_tree()
    xpath = '//sec[(count(p) = 0) and (count(title) = 1)]'

    language_list = self.gv.settings.get_setting('reference-languages', self).split(',')
    reference_terms = set()

    for language in language_list:
        language = language.strip()
        if not language:
            # tolerate stray commas in the setting
            continue

        cue_path = '{0}/language/ref_marker_{1}.txt'.format(self.gv.script_dir, language)
        with open(cue_path, 'r', encoding="utf8") as lang_file:
            for line in lang_file:
                term = line.strip().lower()
                if term:
                    reference_terms.add(term)

    removed_any = False

    for section in tree.xpath(xpath):
        # only sections holding title elements and nothing else are candidates
        if any(item.tag != 'title' for item in section):
            continue

        for item in section:
            text = self.get_stripped_text(item).strip()

            if text.lower() in reference_terms:
                section.getparent().remove(section)
                removed_any = True
                self.debug.print_debug(self, u'Removed a stranded title: {0}'.format(text))
                # the section is detached now; do not look at it again
                break

    if removed_any:
        self.save_tree(tree)
def fuse_references(self):
    """
    Merge every reference that contains no year into the reference before it.

    A reference counts as having a year when its stripped text contains a four-digit number starting with 1
    or 2 (optionally followed by a lower-case letter) or the marker 'n.d.'. A yearless reference with a
    preceding sibling is appended to that sibling and its own wrapper tag is stripped, leaving its content
    inside the previous reference. The tree is saved after each merge.
    """
    # compiled once; raw string avoids invalid escape sequences
    year_test = re.compile(r'((1|2)\d{3}[a-z]?)|(n\.d\.)')

    tree = self.load_dom_tree()

    for ref in tree.xpath('//back/ref-list/ref'):
        text = self.get_stripped_text(ref)

        if year_test.search(text):
            continue

        previous = ref.getprevious()
        if previous is None:
            continue

        # rename, move into the previous reference, then dissolve the temporary wrapper
        ref.tag = 'REMOVE'
        previous.append(ref)
        etree.strip_tags(tree, 'REMOVE')

        self.save_tree(tree)
        self.debug.print_debug(self, u'Appending {0} to previous ref'.format(text))
def tag_bibliography_refs(self):
    """
    Move the paragraphs marked as references into a ref-list in the back matter.

    Nothing is done when the document already has a back/ref-list. Otherwise a back element (after body) and a
    ref-list inside it are found or created. Within every section flagged reflist="yes", title elements are
    removed and each paragraph marked rend="ref" is renamed to ref, given a fresh id, stripped of its rend
    attribute and appended to the ref-list. Paragraphs inside list-item elements are included, matching the
    tag under which the year-matching pass marks list-based references. The tree is saved at the end.
    """
    tree = self.load_dom_tree()

    if len(tree.xpath('//back/ref-list')) > 0:
        return

    self.find_or_create_element(tree, 'back', '//body', True)
    ref_list = self.find_or_create_element(tree, 'ref-list', '//back', False)

    # change this to find <reflist> elements after we're more certain of how to identify them
    candidates = tree.xpath('//sec[@reflist="yes"]/p[@rend="ref"] | //sec[@reflist="yes"]/title '
                            '| //sec[@reflist="yes"]/*/listitem/p[@rend="ref"] | '
                            '//sec[@reflist="yes"]/*/list-item/p[@rend="ref"] | '
                            '//sec[@reflist="yes"]/*/p[@rend="ref"]')

    for refs in candidates:
        if refs.tag == 'title':
            self.debug.print_debug(self, u'Removing title element from reference item')
            refs.getparent().remove(refs)
            continue

        self.debug.print_debug(self, u'Tagging element "{0}" as reference item'.format(refs.tag))
        refs.tag = 'ref'
        refs.attrib['id'] = u'ID{0}'.format(uuid.uuid4())

        if 'rend' in refs.attrib:
            del refs.attrib['rend']

        Manipulate.append_safe(ref_list, refs, self)

    self.save_tree(tree)