Source code for bin.listclassifier

#!/usr/bin/env python
from teimanipulate import *

[docs]__author__ = "Martin Paul Eve"
[docs]__email__ = "martin@martineve.com"
""" A class that looks for common list encodings and handles them properly. 1.) Identifies lists 2.) Converts them to proper TEI list format """ from debug import Debuggable import math from copy import copy import re
[docs]class ListClassifier(Debuggable): def __init__(self, global_variables): self.gv = global_variables self.debug = self.gv.debug self.skiplist = [] Debuggable.__init__(self, 'List Classifier') @staticmethod
[docs] def set_dom_tree(filename): p = etree.XMLParser(remove_blank_text=True, resolve_entities=False) return etree.parse(filename, p)
[docs] def handle_reference_item(self, element, elements, in_list_run, iteration, list_element, offset, to_append): if element in self.skiplist: return iteration manipulate = TeiManipulate(self.gv) iteration += 1 # treat as ref list next_text = manipulate.get_stripped_text(element.getnext()) self.debug.print_debug(self, u'Handling reference element {0}'.format(element.text)) if not in_list_run: list_element = etree.Element('ref') list_element.attrib['target'] = 'None' to_append = None in_list_run = True element.addprevious(list_element) if not element.getnext() is None or next_text == '': if next_text.startswith(u'[') and not element.getnext() in elements: # this element is the last in this list in_list_run = False to_append = element.getnext() elif not next_text.startswith(u'['): while not next_text.startswith(u'['): # anomalous ref if next_text != '': self.debug.print_debug(self, u'Ref missing marker {0}'.format(next_text)) self.skiplist.append(element.getnext()) element.getnext().tag = 'hi' Manipulate.append_safe(element, element.getnext(), self) next_text = manipulate.get_stripped_text(element.getnext()) else: # this can happen with italics self.debug.print_debug(self, u'Ref run/anomaly on {0}'.format(next_text)) for sub_element in element.getnext(): Manipulate.append_safe(element, sub_element, self) in_list_run = False to_append = element.getnext() element.tag = 'p' element.attrib['rend'] = u'Bibliography' element.text = element.text[(int(offset) + int(math.floor(int(iteration / 10)))):] Manipulate.append_safe(list_element, element, self) if not in_list_run: if not to_append is None: if not to_append.text is None: to_append.tag = 'p' to_append.attrib['rend'] = u'Bibliography' to_append.text = to_append.text[(int(offset) + int(math.floor(int(iteration / 10)))):] self.debug.print_debug(self, u'Appending ref element: {0}'.format(to_append.text)) Manipulate.append_safe(list_element, to_append, self) return iteration
[docs] def handle_footnote_item(self, element, elements, in_list_run, iteration, list_element, offset, to_append): iteration += 1 # treat as fn-list list self.debug.print_debug(self, u'Handling footnote element {0}'.format(element.text)) if not in_list_run: list_element = etree.Element('fn-group') to_append = None in_list_run = True element.addprevious(list_element) if not element.getnext().text is None: if element.getnext().text.startswith(u'[') and not element.getnext() in elements: # this element is the last in this list in_list_run = False to_append = element.getnext() elif not element.getnext().text.startswith(u'['): # anomalous ref self.debug.print_debug(self, u'Footnote missing marker {0}'.format(element.getnext().text)) element.text = element.text + element.getnext().text element.getnext().text = '' else: # this can happen with italics self.debug.print_debug(self, u'Footnote run/anomaly on {0}'.format(element.getnext())) for sub_element in element.getnext(): Manipulate.append_safe(element, sub_element, self) in_list_run = False to_append = element.getnext() element.tag = 'note' element.attrib['id'] = 'fn_from_list{0}'.format(iteration) element.text = element.text[(int(offset) + int(math.floor(int(iteration / 10)))):] Manipulate.append_safe(list_element, element, self) if not in_list_run: if not to_append is None: if not to_append.text is None: to_append.tag = 'note' to_append.attrib['id'] = 'fn_from_list{0}'.format(iteration + 1) to_append.text = to_append.text[(int(offset) + int(math.floor(int(iteration / 10)))):] self.debug.print_debug(self, u'Appending fn element: {0}'.format(to_append.text)) Manipulate.append_safe(list_element, to_append, self) return iteration, list_element
[docs] def process_enclosed_ref_list(self, tree, manipulate, treestring): if not u'>[' in treestring and not u'> [' in treestring: self.debug.print_debug(self, u'Leaving enclosed reference processing') return # find it we have a list of enclosed references # todo work with hi tags expression = u'//tei:div[last()]//tei:p[starts-with(.,"[")][following-sibling::tei:p[starts-with(.,"[")]]' elements = tree.xpath(expression, namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}) in_list_run = False list_element = None to_append = None iteration = 0 offset = 0 process = True last_element = None acted = False # ascertain if there are other document references. If so, this is probably a footnote footnote_test = '//text()[contains(self::text(), "[1]")]' footnotes = tree.xpath(footnote_test, namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}) is_footnote = len(footnotes) > 1 if not is_footnote: footnote_test = '//text()[contains(self::text(), "[1,")]' footnotes = tree.xpath(footnote_test, namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}) is_footnote = len(footnotes) > 1 # append an attribute to the preceding element to signal a return point for references if they were wrongly # classified if is_footnote: self.debug.print_debug(self, u'Enclosed footnotes detected') else: self.debug.print_debug(self, u'No enclosed footnotes detected') if not is_footnote and len(elements) > 0: prev = elements[0].getprevious() if prev is not None: prev.attrib['rend'] = 'ref-list-before' else: prev = elements[0].getparent() if prev is not None: prev.attrib['rend'] = 'ref-list-parent' for element in elements: if iteration == 0: if not elements[0].text.startswith(u'[1] '): self.debug.print_debug(self, u'Reference list PANIC: {0}'.format(element.text)) break else: offset = 4 if is_footnote: self.debug.print_debug(self, u'Found in-text footnotes: processing') iteration, list_element = self.handle_footnote_item(element, elements, in_list_run, iteration, list_element, offset, to_append) else: acted = True iteration = self.handle_reference_item(element, elements, in_list_run, iteration, list_element, offset, to_append) if acted: self.gv.used_square_reference_method = True manipulate.enclose_bibliography_tags('//tei:p[@rend="Bibliography"]', 'back', 'div', 'type', 'bibliogr') manipulate.save_tree(tree) self.debug.print_debug(self, u'Enclosing bibliography') if is_footnote: back = manipulate.find_or_create_element(tree, 'back', '//tei:body', True) Manipulate.append_safe(back, list_element, self) new_element_list = list_element for count in range(1, iteration + 2): footnote_test = '//tei:p//text()[contains(self::text(), "[{0}]")]'.format(count) footnote = tree.xpath(footnote_test, namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})[0] note = copy(new_element_list[count - 1]) split = footnote.split("[{0}]".format(count)) footnote.getparent().text = split[0] manipulate.append_safe(footnote.getparent(), note, self) note.tail = split[1] back.remove(list_element) manipulate.save_tree(tree)
[docs] def process_dash_list(self, tree, manipulate, treestring): if not u'>-' in treestring: return # select all p elements followed by another p element expression = u'//tei:p[starts-with(.,"- ")][following-sibling::tei:p[starts-with(.,"- ")]]' elements = tree.xpath(expression, namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}) in_list_run = False list_element = None to_append = None for element in elements: self.debug.print_debug(self, u'Handling list element {0}'.format(element.text)) if not in_list_run: list_element = etree.Element('list') to_append = None in_list_run = True element.addprevious(list_element) if not element.getnext().text is None: if element.getnext().text.startswith(u'- ') and not element.getnext() in elements: # this element is the last in this list in_list_run = False to_append = element.getnext() elif not element.getnext().text.startswith(u'- ') and not element.getnext() in elements: in_list_run = False to_append = None else: # this element is the last in this list self.debug.print_debug(self, u'Ending list run on {0}'.format(element.getnext())) in_list_run = False to_append = element.getnext() element.tag = 'item' element.text = element.text[2:] Manipulate.append_safe(list_element, element, self) if not in_list_run: if not to_append is None: if not to_append.text is None: to_append.tag = 'item' to_append.text = to_append.text[2:] self.debug.print_debug(self, u'Appending final list element: {0}'.format(to_append.text)) Manipulate.append_safe(list_element, to_append, self) manipulate.save_tree(tree)
[docs] def process_superscript_footnotes(self, tree, manipulate): self.debug.print_debug(self, u'Scanning for superscripted footnote entries') superscripts = reversed(tree.xpath('//*[@rend="superscript"]')) footnote_list = [] footnote_text = [] for item in superscripts: text = manipulate.get_stripped_text(item).strip() if self.gv.is_number(text): footnote_list.append(item) footnote_text.append(text) if len(footnote_list) == 0: self.debug.print_debug(self, u'Found no superscripted footnote entries') return # if it doesn't work, this is an expensive operation # however, once we've found all we can stop, so it's worth doing try: footnote = int(footnote_text[len(footnote_text) - 1].strip()) except: self.debug.print_debug(self, u'Unable to parse last footnote as number. Leaving footnote classifier.') return count = len(footnote_text) - 1 offset = footnote failures = 0 for number in range(int(footnote_text[len(footnote_text) - 1].strip()), int(footnote_text[0].strip())): if not str(int(number + (offset - 1))) in footnote_text and \ (int(footnote_text[count]) != int(number + (offset - 1))): self.debug.print_debug(self, u'Expected footnote {0}. Found {1}.'.format(int(number + (offset - 1)), footnote_text[count])) failures += 1 if failures > 1: self.debug.print_debug(self, u'Found more than 1 misaligned footnote. Bailing.') return else: # we need to try and find the number in some text, somewhere, then re-superscript it # it should occur twice only: once for the number itself and once for the paragraph that will be # made into a footnote. However, if the method is a list at the end of the document it will be # only a single occurrence text_blocks = tree.xpath('//text()[contains(self::text()' ', "{0}")]'.format(int(number + (offset - 1)))) if len(text_blocks) >= 1: parsed = False for block_to_change in text_blocks: # (1)[^\d] ref_match = re.compile('^.*(?P<number>{0})[^\d].*'.format(str(int(number + (offset - 1))))) result = ref_match.match(block_to_change.getparent().text) if result: self.debug.print_debug(self, u'Found potential point for footnote #{0}.' u' Superscripting'.format(int(number + (offset - 1)))) parent = block_to_change.getparent() before_after = parent.text.split(str(int(number + (offset - 1)))) parent.text = before_after[0] encapsulate = etree.Element('sup') encapsulate.text = str(int(number + (offset - 1))) encapsulate.tail = ''.join(before_after[1:]) Manipulate.append_safe(parent, encapsulate, self) footnote_list.insert(count, encapsulate) footnote_text.insert(count, str(int(number + (offset - 1)))) offset += 1 parsed = True break if not parsed: self.debug.print_debug(self, u'Unable to find footnote #{0}'.format(int(number + (offset - 1)))) return else: self.debug.print_debug(self, u'Unable to find footnote #{0}'.format(int(number + (offset - 1)))) return pass count -= 1 whole_document = reversed(tree.xpath('//tei:p[@rend="Normal" or not(@rend)]', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})) found = [] footnote = int(footnote_text[0].strip()) for item in whole_document: text = manipulate.get_stripped_text(item).strip() if text.startswith(str(footnote)): found.append(item) footnote -= 1 if len(found) == len(footnote_list): break if len(found) == 0: # last ditch: see if there is an ordered list near the end of the document that has the same number of # elements as the footnote list self.debug.print_debug(self, u'No luck locating footnote text. Attempting list method.') lists = reversed(tree.xpath('//tei:list[@type="ordered"]', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})) first = True for list in lists: if not first: break else: first = True for item in list: found.append(item) if len(found) == len(footnote_list): break # note: all lists are reversed by this point (ie go from the back of the document upwards) if len(found) == len(footnote_list): self.debug.print_debug(self, u'Found {0} superscripted footnote entries'.format(len(found))) current = 0 for footnote in footnote_list: # null the text (this will be generated automatically) if Manipulate.append_safe(footnote, found[current], self): footnote.text = '' footnote.tag = 'note' footnote.attrib['id'] = 'fn_from_list{0}'.format(current) replace_regex = '^({0}[\.\s\)]*)'.format(len(found) - current) if found[current].text and not found[current].text.strip().startswith(str(len(found) - current)): # in this case the footnote text is inside a sub-element for sub_element in found: if sub_element.text and sub_element.text.strip().startswith(str(len(found) - current)): sub_element.text = re.sub(replace_regex, '', sub_element.text) break pass elif found[current].text: found[current].text = re.sub(replace_regex, '', found[current].text) manipulate.save_tree(tree) self.debug.print_debug(self, u'Processing footnote {0}'.format(current + 1)) else: # failed to add a footnote self.debug.print_debug(self, u'Footnote failure. Aborting process.') return current += 1 else: self.debug.print_debug(self, u'Found {0} superscripted footnote entries but could not correlate this with' ' {1} paragraph entries'.format(len(footnote_list), len(found))) manipulate.save_tree(tree)
[docs] def process_number_list(self, tree, manipulate, treestring): if not u'>1' and not u'>(' in treestring: return # select all p elements followed by another p element expression = u'//tei:p[contains("(0123456789ivxlcmdIVXLCMD", substring(., 1, 1)) and not(@rend="Bibliography")]' elements = tree.xpath(expression, namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}) list_element = None iteration = 0 number = None number_match = re.compile('^\s*(?P<rn>\(?\s*(?P<number>\d+)[\.\s\)]+).+') roman_match = re.compile('^(?P<rn>M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})[\.\s\)]+).+', re.IGNORECASE) for element in elements: text = manipulate.get_stripped_text(element) match = number_match.match(text) roman_result = roman_match.match(text) cont = False if match or roman_result: if match: number = match.groups('rn')[1] offset = len(match.groups('rn')[0]) else: offset = len(roman_result.groups('rn')[0]) iteration += 1 roman = self.int_to_roman(iteration) if not number == str(iteration) and not text.startswith(roman) and not text.startswith(roman.lower()): if iteration > 1: old_it = iteration iteration = 1 roman = self.int_to_roman(iteration) if not number == str(iteration) and not text.startswith(roman) \ and not text.startswith(roman.lower()): self.debug.print_debug(self, u'List element was misordered and did not start new list. ' u'Expected {0}'.format(iteration)) iteration = old_it - 1 else: self.debug.print_debug(self, u'Starting new numeric list from cue: {0}'.format(text)) cont = True else: cont = True if iteration > 1: self.debug.print_debug(self, u'Handling list element {0}'.format(text)) else: next_start = iteration + 1 next_element_index = elements.index(element) + 1 if next_element_index > len(elements): cont = False else: next_element = elements[next_element_index] next_text = manipulate.get_stripped_text(next_element) next_match = number_match.match(next_text) next_roman_result = roman_match.match(next_text) next_number = None if next_match or next_roman_result: if next_match: next_number = next_match.groups('rn')[1] next_roman = self.int_to_roman(iteration) if not next_number == str(iteration + 1) and not next_text.startswith(next_roman)\ and not next_text.startswith(next_roman.lower()): cont = False else: cont = False if cont: self.debug.print_debug(self, u'Starting new numeric list from cue: {0}'.format(text)) else: self.debug.print_debug(self, u'Cannot start list on last element: {0}'.format(text)) if cont: if iteration == 1: # start new list list_element = etree.Element('list') list_element.attrib[u'type'] = u'ordered' element.addprevious(list_element) element.tag = 'item' if element.text: element.text = element.text[(int(offset) + int(math.floor(int(iteration/10)))):] else: element.text = text[(int(offset) + int(math.floor(int(iteration/10)))):] Manipulate.append_safe(list_element, element, self) elif list_element is not None: element.tag = 'item' if element.text: element.text = element.text[(int(offset) + int(math.floor(int(iteration/10)))):] else: element.text = text[(int(offset) + int(math.floor(int(iteration/10)))):] Manipulate.append_safe(list_element, element, self) else: self.debug.print_debug(self, u'Reference list PANIC: {0}'.format(text)) manipulate.save_tree(tree)
[docs] def int_to_roman(self, input): if type(input) != type(1): raise TypeError("expected integer, got %s" % type(input)) if not 0 < input < 4000: raise ValueError("Argument must be between 1 and 3999") ints = (1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1) nums = ('M', 'CM', 'D', 'CD','C', 'XC','L','XL','X','IX','V','IV','I') result = "" for i in range(len(ints)): count = int(input / ints[i]) result += nums[i] * count input -= ints[i] * count return result
[docs] def run(self): if int(self.gv.settings.args['--aggression']) < int(self.gv.settings.get_setting('listclassifier', self, domain='aggression')): self.debug.print_debug(self, u'Aggression level too low: exiting module.') return dash_lists = self.gv.settings.get_setting('dash-lists', self) == "True" bracket_refs = self.gv.settings.get_setting('bracket-references-and-footnotes', self) == "True" superscripted_footnotes = self.gv.settings.get_setting('superscripted-footnotes', self) == "True" if not dash_lists and not bracket_refs and not superscripted_footnotes: return # load the DOM tree = self.set_dom_tree(self.gv.tei_file_path) manipulate = TeiManipulate(self.gv) string_version = etree.tostring(tree, encoding="unicode") # look for dash separated list # - Item 1 # - Item 2 if dash_lists: self.process_dash_list(tree, manipulate, string_version) if int(self.gv.settings.args['--aggression']) >= 10 and bracket_refs: backup_tree = copy(tree) try: # look for footnote list [1], [2] etc. self.process_enclosed_ref_list(tree, manipulate, string_version) except: self.debug.print_debug(self, u'Error processing footnote or reference list. Reverting to backup tree') tree = backup_tree manipulate.save_tree(tree) if superscripted_footnotes: self.process_superscript_footnotes(tree, manipulate) self.process_number_list(tree, manipulate, string_version)