#!/usr/bin/env python
"""captionclassfier: a tool to search for potential table titles and then link in-text entities
Usage:
captionclassfier.py tables <input> [options]
captionclassfier.py graphics <input> [options]
captionclassfier.py all <input> [options]
Options:
-d, --debug Enable debug output
-h, --help Show this screen.
-v, --version Show version.
"""
from docopt import docopt
from bare_globals import GV
from debug import Debuggable
from nlmmanipulate import NlmManipulate
from lxml import etree
import uuid
import re
import editdistance
class CaptionClassifier(Debuggable):
    """Find captions for tables and graphics and link in-text mentions to them.

    Each ``run_*`` method loads the XML tree through ``NlmManipulate``, edits
    it in place, and writes it to both ``gv.nlm_file_path`` and
    ``gv.nlm_temp_file_path``.

    A piece of text is treated as a caption when it matches one of the
    patterns below: some label text, a number, then ``:`` or ``.`` followed by
    the caption body (for example ``Figure 1: A pub``). The colon form is
    always tried before the dot form.
    """

    # Raw strings keep the regex escapes (\s, \d, \., \:) out of Python's own
    # string-escape processing; the compiled patterns are unchanged.
    _GRAPHIC_REGEX_DOT = re.compile(r'^.+?\s*\d+\..+')
    _GRAPHIC_REGEX_COLON = re.compile(r'^.+?\s*\d+\:.+')
    _TABLE_REGEX_DOT = re.compile(r'^.+?[\s\-]*\d+\..+')
    _TABLE_REGEX_COLON = re.compile(r'^.+?[\s\-]*\d+\:.+')

    def __init__(self, global_variables):
        """Store the shared settings object and register with the debug facility.

        :param global_variables: object exposing ``debug``, ``nlm_file_path``
            and ``nlm_temp_file_path`` (the attributes this class reads).
        """
        self.gv = global_variables
        self.debug = self.gv.debug
        Debuggable.__init__(self, 'Caption Classifier')

    # ------------------------------------------------------------------
    # cross-reference insertion
    # ------------------------------------------------------------------

    @staticmethod
    def _build_xref(rid, ref_type, text, tail):
        """Create a detached ``<xref>`` element with the given attributes, text and tail."""
        xref = etree.Element('xref')
        xref.attrib['rid'] = rid
        xref.attrib['ref-type'] = ref_type
        xref.text = text
        xref.tail = tail
        return xref

    def replace_in_text(self, id, element, replace_text, ref_type):
        """Turn the first occurrence of ``replace_text`` in ``element.text`` into an xref.

        The text before the occurrence stays in ``element.text``; the text
        after it becomes the tail of the new ``<xref>``.

        :param id: value for the xref's ``rid`` attribute (the name shadows the
            builtin but is kept so keyword callers continue to work).
        :param element: element whose ``text`` contains ``replace_text``.
        :param replace_text: literal text to wrap in the xref.
        :param ref_type: value for the xref's ``ref-type`` attribute.
        """
        before, _, after = element.text.partition(replace_text)
        element.text = before
        new_element = self._build_xref(id, ref_type, replace_text, after)
        # NOTE(review): append_safe is defined elsewhere. If it appends as the
        # last child, an element that already has children receives the xref
        # after them rather than directly after its leading text -- confirm.
        NlmManipulate.append_safe(element, new_element, self)

    def replace_in_tail(self, id, element, replace_text, ref_type):
        """Turn the first occurrence of ``replace_text`` in ``element.tail`` into an xref.

        The new ``<xref>`` is inserted as the sibling directly after
        ``element``; the text before the occurrence stays in ``element.tail``
        and the text after it becomes the xref's tail.

        :param id: value for the xref's ``rid`` attribute.
        :param element: element whose ``tail`` contains ``replace_text``.
        :param replace_text: literal text to wrap in the xref.
        :param ref_type: value for the xref's ``ref-type`` attribute.
        :return: the newly inserted ``<xref>`` element.
        """
        before, _, after = element.tail.partition(replace_text)
        new_element = self._build_xref(id, ref_type, replace_text, after)
        parent = element.getparent()
        parent.insert(parent.index(element) + 1, new_element)
        element.tail = before
        return new_element

    def link(self, table_ids, replace_texts, paragraphs, ref_type):
        """Replace in-text mentions of each title with an ``<xref>`` to its id.

        This is more involved than a plain string replace because a mention
        can sit in the paragraph's own text, in a child element's text (such
        as italic), or in a child element's tail.

        :param table_ids: ids, parallel to ``replace_texts``.
        :param replace_texts: titles to look for (e.g. ``Table 1``).
        :param paragraphs: elements to search.
        :param ref_type: value for the ``ref-type`` attribute of created xrefs.
        """
        for paragraph in paragraphs:
            # zip pairs each title with its own id; a list.index() lookup would
            # return the first id whenever two titles are equal.
            for table_id, replace_text in zip(table_ids, replace_texts):
                if not replace_text:
                    # an empty title matches everywhere and cannot be split on
                    continue

                if paragraph.text is not None and replace_text in paragraph.text:
                    self.replace_in_text(table_id, paragraph, replace_text, ref_type)
                    self.debug.print_debug(self, u'Successfully linked {0} to {1}'.format(replace_text, table_id))

                for sub_element in paragraph:
                    if sub_element.tag == 'xref':
                        # never link inside (or directly after) an existing link
                        continue

                    if sub_element.text and replace_text in sub_element.text:
                        self.replace_in_text(table_id, sub_element, replace_text, ref_type)
                        self.debug.print_debug(self,
                                               u'Successfully linked {0} to '
                                               u'{1} from sub-element'.format(replace_text, table_id))

                    if sub_element.tail is not None and replace_text in sub_element.tail:
                        self.replace_in_tail(table_id, sub_element, replace_text, ref_type)
                        self.debug.print_debug(self,
                                               u'Successfully linked {0} to {1} from sub-tail'.format(replace_text,
                                                                                                      table_id))

    # ------------------------------------------------------------------
    # shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _detect_separator(text, regex_colon, regex_dot):
        """Return ``':'`` or ``'.'`` when ``text`` matches a caption pattern, else ``None``."""
        if regex_colon.match(text):
            return ':'
        if regex_dot.match(text):
            return '.'
        return None

    @staticmethod
    def _split_title(text, separator):
        """Split caption text at the FIRST separator into ``(title, caption)``.

        Only the first separator is consumed, so later colons or full stops
        remain part of the caption instead of being deleted.
        """
        title, _, caption = text.partition(separator)
        return title.strip(), caption.strip()

    @staticmethod
    def _contains_graphic(element):
        """Return True when ``element`` has a direct ``graphic`` child."""
        return any(sub.tag == 'graphic' for sub in element)

    @staticmethod
    def _set_label(element, title):
        """Write ``title`` into the element's ``label`` child, creating it at index 0 if absent."""
        labels = element.xpath('label')
        if labels:
            label = labels[0]
        else:
            label = etree.Element('label')
            element.insert(0, label)
        label.text = title

    @staticmethod
    def _ensure_id(element):
        """Return the element's ``id`` attribute, assigning ``ID<uuid4>`` first if it has none."""
        if 'id' not in element.attrib:
            element.attrib['id'] = u'ID{0}'.format(uuid.uuid4())
        return element.attrib['id']

    @staticmethod
    def _remove_caption_source(source):
        """Remove the element a caption was taken from.

        When the source is a title, an empty ``<title>`` is left in its place.

        :return: the empty replacement title, or ``None`` if none was created.
        """
        placeholder = None
        if source.tag.endswith('title'):
            placeholder = etree.Element('title')
            placeholder.text = ''
            source.addnext(placeholder)
        source.getparent().remove(source)
        return placeholder

    def _find_caption_paragraph(self, manipulate, candidates, regex_colon, regex_dot, skip_with_graphic=False):
        """Return ``(element, text, separator)`` for the first candidate that looks like a caption.

        Candidates that are ``None``, are not ``p`` elements, or (when
        ``skip_with_graphic`` is set) have a direct ``graphic`` child are
        ignored. Returns ``(None, None, None)`` when nothing matches.
        """
        for candidate in candidates:
            if candidate is None or candidate.tag != 'p':
                continue
            if skip_with_graphic and self._contains_graphic(candidate):
                continue
            text = manipulate.get_stripped_text(candidate)
            separator = self._detect_separator(text, regex_colon, regex_dot)
            if separator is not None:
                return candidate, text, separator
        return None, None, None

    def _find_caption_title(self, manipulate, section, regex_colon, regex_dot):
        """Return ``(element, text, separator)`` if the first ``title`` child of ``section`` looks like a caption.

        Returns ``(None, None, None)`` when ``section`` is ``None``, has no
        title child, or the title text does not match.
        """
        if section is None:
            return None, None, None
        titles = section.xpath('title')
        if not titles:
            return None, None, None
        text = manipulate.get_stripped_text(titles[0])
        separator = self._detect_separator(text, regex_colon, regex_dot)
        if separator is None:
            return None, None, None
        return titles[0], text, separator

    def _caption_graphic(self, graphic, title, caption, separator):
        """Attach label and caption to ``graphic``, tidy its tail and return its id."""
        self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

        self._set_label(graphic, title)

        caption_element = etree.Element('caption')
        new_p = etree.Element('p')
        new_p.text = caption
        NlmManipulate.append_safe(caption_element, new_p, self)
        NlmManipulate.append_safe(graphic, caption_element, self)

        if graphic.tail:
            # drop caption text that trails the graphic now that it lives in <caption>
            tail = graphic.tail.replace(title + separator, '')
            if caption:
                # with an empty caption the next line would strip every separator
                tail = tail.replace(caption + separator, '')
                tail = tail.replace(caption, '')
            graphic.tail = tail

        return self._ensure_id(graphic)

    def _write_tree(self, tree):
        """Write the tree to both the main and the temporary output path."""
        tree.write(self.gv.nlm_file_path)
        tree.write(self.gv.nlm_temp_file_path)

    # ------------------------------------------------------------------
    # graphics
    # ------------------------------------------------------------------

    def run_graphics_sibling(self):
        """Caption graphics whose caption sits in a paragraph next to the graphic's parent.

        Images are hard to handle because Word/OO puts the caption in
        different places. For every ``graphic`` the paragraph after its parent
        is tried first, then the paragraph before it, and only if neither
        matches the title of the nearest enclosing element whose tag ends in
        ``sec``. The matched element is removed (a title is replaced by an
        empty one), the graphic gets a label, caption and id, and mentions of
        the title in any paragraph are linked with ``ref-type="fig"``.
        """
        self.debug.print_debug(self, u'Attempting to classify captions for graphics objects [sibling]')

        manipulate = NlmManipulate(self.gv)
        tree = manipulate.load_dom_tree()

        graphic_titles = []
        graphic_ids = []

        for graphic in tree.xpath('//graphic'):
            container = graphic.getparent()
            if container is None:
                # a root-level graphic has no siblings to inspect
                continue

            source, text, separator = self._find_caption_paragraph(
                manipulate, (container.getnext(), container.getprevious()),
                self._GRAPHIC_REGEX_COLON, self._GRAPHIC_REGEX_DOT)

            if source is None:
                # Fallback only: a caption already found in a neighbouring
                # paragraph must not be overridden by the section title.
                section = container
                while section is not None and not section.tag.endswith('sec'):
                    section = section.getparent()
                source, text, separator = self._find_caption_title(
                    manipulate, section, self._GRAPHIC_REGEX_COLON, self._GRAPHIC_REGEX_DOT)

            if source is None:
                continue

            title, caption = self._split_title(text, separator)
            graphic_id = self._caption_graphic(graphic, title, caption, separator)
            self._remove_caption_source(source)

            graphic_titles.append(title)
            graphic_ids.append(graphic_id)

        self.link(graphic_ids, graphic_titles, tree.xpath('//p'), 'fig')
        self._write_tree(tree)

    def run_graphics(self):
        """Caption graphics whose caption shares a paragraph with the graphic.

        Handles markup such as
        ``<p>Figure 1: Martin Eve at the pub<graphic .../></p>``: when the
        stripped text of the graphic's parent ``p`` looks like a caption, the
        graphic gets a label, caption and id. Mentions are then linked with
        ``ref-type="fig"``, the tree is written out, and
        :meth:`run_graphics_sibling` runs as a second pass.
        """
        self.debug.print_debug(self, u'Attempting to classify captions for graphics objects [plain]')

        manipulate = NlmManipulate(self.gv)
        tree = manipulate.load_dom_tree()

        graphic_titles = []
        graphic_ids = []

        for graphic in tree.xpath('//graphic'):
            source, text, separator = self._find_caption_paragraph(
                manipulate, (graphic.getparent(),),
                self._GRAPHIC_REGEX_COLON, self._GRAPHIC_REGEX_DOT)

            if source is None:
                continue

            title, caption = self._split_title(text, separator)
            graphic_titles.append(title)
            graphic_ids.append(self._caption_graphic(graphic, title, caption, separator))

        self.link(graphic_ids, graphic_titles, tree.xpath('//p'), 'fig')
        self._write_tree(tree)

        self.run_graphics_sibling()

    # ------------------------------------------------------------------
    # tables
    # ------------------------------------------------------------------

    def _merge_section_after_title_removal(self, tree, table, placeholder):
        """Fold the table's section into a neighbouring section once its title became the caption.

        The empty replacement title is deleted. The section holding the table
        is then moved into the closest preceding sibling whose tag ends in
        ``sec`` or, failing that, to the end of its parent when the parent's
        tag ends in ``sec``. After the move the section's own tag is stripped,
        so its content becomes part of the target section.
        """
        placeholder.tag = 'REMOVE'
        etree.strip_elements(tree, 'REMOVE')

        section = table.getparent()
        target = section.getprevious()
        while target is not None and not target.tag.endswith('sec'):
            target = target.getprevious()

        if target is not None:
            message = u'Moved table and siblings to previous section'
        else:
            target = section.getparent()
            if target is None or not target.tag.endswith('sec'):
                return
            message = u'Moved table and siblings to parent section'

        target.append(section)
        section.tag = 'REMOVE'
        etree.strip_tags(tree, 'REMOVE')
        self.debug.print_debug(self, message)

    def _caption_from_first_row(self, table):
        """Promote the first table row to a caption when it does not look like data.

        The first row qualifies when the table has at least three rows, the
        first row has the fewest columns, the second row has more, and the
        first cell of row one differs more from row two's first cell than row
        two's differs from row three's (edit distance).

        :return: the created ``caption`` element, or ``None``.
        """
        inner = table.find('table')
        if inner is None:
            return None

        rows = list(inner)
        if len(rows) < 3:
            return None

        column_counts = [len(row) for row in rows]
        # a missing first cell or a cell without text counts as empty text
        first_cells = [(row[0].text or '') if len(row) else '' for row in rows]

        # compare counts with the smallest COUNT, not with a row number
        fewest_columns = min(column_counts)
        if column_counts[0] != fewest_columns or column_counts[1] == fewest_columns:
            return None

        # also require the first row to be unlike the rows that follow it
        if editdistance.eval(first_cells[0], first_cells[1]) <= editdistance.eval(first_cells[1], first_cells[2]):
            return None

        caption_element = etree.Element('caption')
        caption_element.text = first_cells[0]
        NlmManipulate.append_safe(table, caption_element, self)
        inner.remove(rows[0])
        return caption_element

    def run_tables(self):
        """Find a caption for every ``table-wrap`` and link in-text mentions to it.

        For each table the following sibling paragraph is tried first, then
        the preceding one (paragraphs with a direct ``graphic`` child are
        skipped), and only if neither matches the ``title`` of the table's
        parent. A match is removed from the document and becomes the table's
        label and caption; the caption keeps plain text only. If the title was
        used, the now title-less section is merged into a neighbouring
        section. When no caption was found this way, the first table row is
        considered instead. Finally mentions are linked with
        ``ref-type="table"`` and the tree is written out.
        """
        self.debug.print_debug(self, u'Attempting to classify captions for table objects')

        manipulate = NlmManipulate(self.gv)
        tree = manipulate.load_dom_tree()

        table_titles = []
        table_ids = []

        for table in tree.xpath('//table-wrap'):
            caption_element = None
            used_title = False

            # A missing previous/next sibling is simply skipped by the helper.
            source, text, separator = self._find_caption_paragraph(
                manipulate, (table.getnext(), table.getprevious()),
                self._TABLE_REGEX_COLON, self._TABLE_REGEX_DOT, skip_with_graphic=True)

            if source is None:
                # Fallback only: see if the title in this section contains
                # text we can match.
                source, text, separator = self._find_caption_title(
                    manipulate, table.getparent(), self._TABLE_REGEX_COLON, self._TABLE_REGEX_DOT)
                used_title = source is not None

            if source is not None:
                title, caption = self._split_title(text, separator)

                # strip all formatting from caption for ease of parsing
                # TODO: preserve formatting (far harder)
                new_p = etree.Element('p')
                new_p.text = caption

                placeholder = self._remove_caption_source(source)

                self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

                self._set_label(table, title)

                caption_element = etree.Element('caption')
                NlmManipulate.append_safe(caption_element, new_p, self)
                table.insert(1, caption_element)

                table_titles.append(title)
                table_ids.append(self._ensure_id(table))

                if used_title:
                    # the section lost its title, so move its content into a
                    # neighbouring section and strip the wrapper
                    self._merge_section_after_title_removal(tree, table, placeholder)

            # If none of that worked, try to find caption in table rows
            if caption_element is None:
                self._caption_from_first_row(table)

        self.link(table_ids, table_titles, tree.xpath('//p'), 'table')
        self._write_tree(tree)

    # ------------------------------------------------------------------
    # ext-link compliance
    # ------------------------------------------------------------------

    def run_ext_link_compliance(self):
        """Move every ``graphic`` nested directly in an ``ext-link`` to just after that link."""
        self.debug.print_debug(self, u'Attempting to correct any mis-nested graphics elements')

        manipulate = NlmManipulate(self.gv)
        tree = manipulate.load_dom_tree()

        for graphic in tree.xpath('//ext-link/graphic'):
            ext_link = graphic.getparent()
            container = ext_link.getparent()
            container.insert(container.index(ext_link) + 1, graphic)

        self._write_tree(tree)
def main():
    """Command-line entry point: parse arguments and run the requested passes.

    ``tables`` runs the table pass, ``graphics`` runs the graphics pass and
    ``all`` runs both plus the ext-link compliance pass.
    """
    args = docopt(__doc__, version='meTypeset 0.1')
    bare_gv = GV(args)

    if args['--debug']:
        bare_gv.debug.enable_debug()

    table_classifier_instance = CaptionClassifier(bare_gv)

    if args['all'] or args['tables']:
        table_classifier_instance.run_tables()

    if args['all'] or args['graphics']:
        table_classifier_instance.run_graphics()

    # The usage string defines no 'enforce' command, so the parsed arguments
    # have no such key; .get() avoids a KeyError for 'tables' and 'graphics'.
    if args['all'] or args.get('enforce'):
        table_classifier_instance.run_ext_link_compliance()
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()