Source code for bin.sizeclassifier

#!/usr/bin/env python
from teimanipulate import *

__author__ = "Martin Paul Eve"
__email__ = "martin@martineve.com"

# NOTE: this string follows other statements, so Python evaluates it as a bare expression
# rather than storing it as the module docstring.
"""
A class that scans for meTypeset size fields in a TEI file.

1.) Identifies a list of sizes
2.) Ascertains the density and likelihood of the size being a heading
3.) Returns a manipulator ready to implement all the changes to the TEI file
"""

from debug import Debuggable
class SizeClassifier(Debuggable):
    """
    Scans a TEI file for meTypesetSize attributes, promotes sufficiently large or otherwise
    heading-like text to <head> elements and arranges the resulting sections into nested <div>s.

    All changes are applied through a TeiManipulate object; run() is the entry point.
    """

    # Prefix map shared by every XPath query issued by this class.
    _TEI_NS = {'tei': 'http://www.tei-c.org/ns/1.0'}

    def __init__(self, global_variables):
        """
        @param global_variables: the shared settings object; this class reads its debug, settings
        and tei_file_path members
        """
        self.gv = global_variables
        self.debug = self.gv.debug
        self.size_cutoff = int(self.gv.settings.get_setting('minimum-heading-size', self))
        self.max_headings = int(self.gv.settings.get_setting('maximum-headings', self))
        self.root = 0
        self.tree = None
        Debuggable.__init__(self, 'Size Classifier')

    @staticmethod
    def get_values(tree, search_attribute):
        """
        Counts how often each value of search_attribute occurs on TEI "hi" elements.

        @param tree: the DOM tree (or element) to search
        @param search_attribute: the attribute name to look for on tei:hi elements
        @return: a dictionary mapping each attribute value to its number of occurrences
        """
        # NOTE(review): in XPath 1.0 not("") is true() and a node-set compared with a boolean is
        # itself converted to a boolean, so this predicate appears to test for the attribute's
        # presence rather than for a non-empty value.
        expression = '//tei:hi[@' + search_attribute + '=not("")]'
        sizes = {}

        for child in tree.xpath(expression, namespaces=SizeClassifier._TEI_NS):
            value = child.get(search_attribute)
            sizes[value] = sizes.get(value, 0) + 1

        return sizes

    @staticmethod
    def get_sizes_ordered(tree):
        """
        Lists the meTypesetSize value of every TEI "head" element carrying one, in the order
        the XPath query returns them.

        @param tree: the DOM tree (or element) to search
        @return: a list of attribute values (strings)
        """
        heads = tree.xpath("//tei:head[@meTypesetSize=not('')]", namespaces=SizeClassifier._TEI_NS)
        return [head.get("meTypesetSize") for head in heads]

    def set_dom_tree(self, filename):
        """
        Parses filename, stores the result on self.tree and returns it.

        The parser drops ignorable whitespace and does not resolve entities.

        @param filename: path of the XML file to parse
        @return: the parsed tree
        """
        p = etree.XMLParser(remove_blank_text=True, resolve_entities=False)
        self.tree = etree.parse(filename, p)
        return self.tree

    @staticmethod
    def handle_bold_only_paragraph(manipulate, root_size):
        """
        This method looks for paragraphs that contain only bold text. It then transforms them to titles.
        @param manipulate: a TeiManipulator object
        @param root_size: the size styling to apply to these elements
        """
        expression = u'//tei:p[(contains(@rend, "bold") or count(tei:hi) = count(tei:hi[contains(@rend, "bold")])) ' \
                     u'and not(text()[normalize-space()!=""])]/tei:hi'

        manipulate.change_self_size(expression, str(root_size))

    @staticmethod
    def handle_heading(manipulate, attribute, root_size):
        """
        This method looks for paragraphs that contain the specified attribute in their rend tag.
        It then transforms them to titles.
        @param manipulate: a TeiManipulator object
        @param attribute: a string to search for in the rend attribute
        @param root_size: the size styling to apply to these elements
        """
        expression = u'//tei:p[contains(@rend, "{0}")]'.format(attribute)

        manipulate.enclose_and_change_self_size(expression, str(root_size), 'p', 'hi')

    def get_sizes(self, tree):
        """
        Collects the meTypesetSize values in tree and drops those that occur too often.

        @param tree: the DOM tree to search
        @return: a dictionary of size value to frequency, restricted to sizes whose frequency
        is below the 'maximum-headings' setting
        """
        sizes = self.get_values(tree, "meTypesetSize")

        if len(sizes) > 0:
            self.debug.print_debug(self, u'Explicitly specified size variations and their frequency of '
                                         u'occurrence: {0}'.format(str(sizes)))

        limit = float(self.max_headings)
        return {size: frequency for size, frequency in sizes.items() if float(frequency) < limit}

    def correlate_styled_headings(self, manipulate):
        """
        Maps word-processor style names ("heading 1", "Heading 2", "H3", ...) found in rend
        attributes onto size values and applies them through handle_heading.

        @param manipulate: a TeiManipulator object
        @return: the DOM tree, reloaded from self.gv.tei_file_path after the changes
        """
        # reload the DOM
        tree = self.set_dom_tree(self.gv.tei_file_path)

        # get a numerical list of explicit size values inside meTypesetSize attributes
        sizes = self.get_sizes(tree)

        sorted_list = []
        headings = {}

        # correlate tag sizes specified by true word headings ("heading 1", "heading 2" etc.) to our index
        for size, frequency in sizes.items():
            if float(frequency) < float(self.max_headings) and float(size) > float(self.size_cutoff):
                sorted_list.append(size)

        # NOTE(review): the sizes are attribute strings, so this sort is lexicographic
        # ("100" orders before "18"); left unchanged because the mapping below depends on it.
        sorted_list = sorted(sorted_list)

        if len(sorted_list) > 0:
            # every explicit size except the last one in sort order is bound to a heading level
            for count in range(0, len(sorted_list) - 1):
                key = u'heading {0}'.format(count + 1)
                headings[key] = sorted_list[count]

                key = u'Heading {0}'.format(count + 1)
                headings[key] = sorted_list[count]

            # the remaining levels (up to level 8) fall back to a synthetic descending scale
            for count in range(len(sorted_list) - 1, 8):
                key = u'heading {0}'.format(count + 1)
                headings[key] = 100 - 10 * count

                key = u'Heading {0}'.format(count + 1)
                headings[key] = 100 - 10 * count
        else:
            # no usable explicit sizes: use a fixed scale for all the known style names
            headings = {'title': 100, 'heading 1': 100, 'heading 2': 90, 'heading 3': 80, 'heading 4': 70,
                        'heading 5': 60, 'heading 6': 50, 'heading 7': 40, 'heading 8': 30, 'heading 9': 20}

            headings.update({'Title': 100, 'Heading 1': 100, 'Heading 2': 90, 'Heading 3': 80, 'Heading 4': 70,
                             'Heading 5': 60, 'Heading 6': 50, 'Heading 7': 40, 'Heading 8': 30, 'Heading 9': 20})

            headings.update({'H1': 100, 'H2': 90, 'H3': 80, 'H4': 70, 'H5': 60, 'H6': 50, 'H7': 40, 'H8': 30,
                             'H9': 20})

        for key, value in headings.items():
            self.debug.print_debug(self, u'Changing {0} to size {1}'.format(key, value))
            self.handle_heading(manipulate, key, float(value))

        # reload the DOM
        tree = self.set_dom_tree(self.gv.tei_file_path)

        return tree

    def convert_to_headings(self, manipulate, sizes, tree):
        """
        Turns the parent of every sized "hi" element at or above the size cutoff into a "head"
        and unwraps cit/quote elements that end up directly containing a head.

        @param manipulate: a TeiManipulator object
        @param sizes: an iterable of size values (strings) to consider
        @param tree: unused on entry; the tree is reloaded from self.gv.tei_file_path
        @return: the reloaded and normalized DOM tree
        """
        for size in sizes:
            if float(size) >= float(self.size_cutoff):
                # the size is greater than or equal to the configured cutoff: treat it as a heading
                self.debug.print_debug(self,
                                       u'Size ({0}) greater '
                                       u'than or equal to {1}. '
                                       u'Treating as a heading.'.format(str(size),
                                                                        str(self.size_cutoff)))

                # instruct the manipulator to change the parent tag of every tag it finds containing
                # a "hi" tag with meTypesetSize set to the value found to "head"
                # so, for example <p><hi meTypesetSize="18">some text</hi></p>
                # will be transformed to
                # <head><hi meTypesetSize="18">some text</hi></head>
                manipulate.change_outer('//tei:hi[@meTypesetSize=\'{0}\']'.format(size), 'head', size)

        tree = self.set_dom_tree(self.gv.tei_file_path)

        # mark the quote and cit wrappers around a head for unwrapping; their children are kept
        for normalize in tree.xpath('//tei:cit/tei:quote/tei:head', namespaces=self._TEI_NS):
            quote = normalize.getparent()
            quote.tag = 'REMOVE'
            quote.getparent().tag = 'REMOVE'

        etree.strip_tags(tree, 'REMOVE')
        manipulate.save_tree(tree)
        self.debug.print_debug(self, u'Normalizing nested headings inside cit/quote blocks')

        return tree

    def encapsulate_headings(self, manipulate, tree):
        """
        Moves every "head" that has a preceding sibling node, together with all of its following
        siblings, into a new "div" inserted directly after the head's former parent.

        @param manipulate: a TeiManipulator object
        @param tree: the DOM tree to modify and save
        """
        titles = tree.xpath('//tei:head[preceding-sibling::node()]', namespaces=self._TEI_NS)

        for title in titles:
            existing_section = title.getparent()
            new_section = etree.Element('div')

            # collect first, then move: appending re-parents a node and would break the walk
            to_move = []
            sibling = title

            while sibling is not None:
                to_move.append(sibling)
                sibling = sibling.getnext()

            for sibling in to_move:
                new_section.append(sibling)

            existing_section.addnext(new_section)

            manipulate.save_tree(tree)
            self.debug.print_debug(self, u'Handling unnested title: '
                                         u'{0}'.format(manipulate.get_stripped_text(title).strip()))

        manipulate.save_tree(tree)

    def nest_headings(self, manipulate, tree):
        """
        Re-arranges the document's "div" elements so that sections with smaller headings are
        nested inside, or placed after, sections with larger ones.

        @param manipulate: a TeiManipulator object
        @param tree: unused on entry; the tree is reloaded through manipulate.load_dom_tree()
        @return: a tuple (stack, tree) where stack is the list of (size, div) pairs in the
        order the divs were found before any move
        """
        tree = manipulate.load_dom_tree()

        stack = []
        message = {}

        for div in tree.xpath('//tei:div', namespaces=self._TEI_NS):
            title = div.xpath('tei:head', namespaces=self._TEI_NS)

            if len(title) == 0:
                # a section without a heading is treated as size 100
                size = 100
                message[div] = 'No title found in this block'
            else:
                size = title[0].attrib['meTypesetSize']
                message[div] = manipulate.get_stripped_text(title[0]).strip()

            stack.append((size, div))

        root_size = None

        # maps a size (as float) to the lowest stack index that the backwards search for an
        # equally sized section may reach
        dict_thresholds = {}

        for position, element in enumerate(stack):
            if position == 0:
                # the first section defines the root size
                root_size = element[0]
                self.debug.print_debug(self, u'Set root size as {0}'.format(root_size))
                continue

            size, div = element
            previous, previous_div = stack[position - 1]

            # nothing may be larger than the root
            if float(size) > float(root_size):
                size = float(root_size)

            # handle an element that is the root size
            if float(size) == float(root_size):
                dict_thresholds[float(root_size)] = position

                for item in dict_thresholds:
                    dict_thresholds[item] = position

                self.debug.print_debug(self, u'Heading {0} ("{1}") was same size as root. '
                                             u'Resetting stack.'.format(position + 1, message[div]))

            # handle an element that is smaller than its predecessor
            elif float(size) < float(previous):
                addnext = False

                # traverse up the tree to see if there is an equal size element
                iteration = position - 1

                if float(size) not in dict_thresholds:
                    dict_thresholds[float(size)] = position

                while iteration >= dict_thresholds[float(size)]:
                    iterpos, iterdiv = stack[iteration]

                    if float(iterpos) == float(size):
                        previous_div = iterdiv
                        addnext = True
                        break

                    iteration -= 1

                if addnext:
                    # an equally sized section exists: become its next sibling
                    previous_div.addnext(div)
                else:
                    # otherwise nest inside the predecessor
                    previous_div.append(div)

                dict_thresholds[float(size)] = position

                manipulate.save_tree(tree)
                self.debug.print_debug(self, u'Moved heading {0} ("{1}") into previous because '
                                             u'it is smaller'.format(position + 1, message[div]))

            # handle an element that is bigger than its predecessor
            elif float(size) > float(previous):
                # traverse up the tree to see if there is an equal size element
                iteration = position - 1

                if float(size) not in dict_thresholds:
                    dict_thresholds[float(size)] = position

                while iteration >= dict_thresholds[float(size)]:
                    iterpos, iterdiv = stack[iteration]

                    if float(iterpos) == float(size):
                        previous_div = iterdiv
                        break

                    iteration -= 1

                previous_div.addnext(div)

                dict_thresholds[float(size)] = position

                # NOTE(review): this compares a stored stack position with a size value;
                # behaviour kept as found -- confirm the intent.
                for item in dict_thresholds:
                    if float(dict_thresholds[item]) < float(size):
                        dict_thresholds[item] = position

                manipulate.save_tree(tree)
                self.debug.print_debug(self, u'Moved heading {0} ("{1}") into previous '
                                             u'because it is bigger'.format(position + 1, message[div]))

            # handle an element that is the same size as its predecessor
            elif float(size) == float(previous):
                previous_div.addnext(div)
                self.debug.print_debug(self, u'Added heading {0} ("{1}") adjacent to previous because '
                                             u'it is the same size'.format(position + 1, message[div]))

        return stack, tree

    def verify_headings(self, stack, tree):
        """
        Checks that the "div" elements of tree still appear in the order recorded in stack.

        @param stack: the list of (size, div) pairs returned by nest_headings
        @param tree: the DOM tree to check
        @return: True if every div matches the stack entry at the same index, otherwise False
        (an error and a debug warning are emitted in that case)
        """
        # verify that the stack has not been disordered
        for position, div in enumerate(tree.xpath('//tei:div', namespaces=self._TEI_NS)):
            # a div beyond the end of the stack cannot match anything: report it as disordered
            # instead of raising an IndexError
            if position >= len(stack) or stack[position][1] != div:
                self.debug.write_error(self, u'Size elements were disordered', '002')
                self.debug.print_debug(self, u'WARNING: size elements were disordered')
                return False

        return True

    def remove_empty_headings(self, manipulate, tree):
        """
        Deletes "head" elements that contain neither text nor a graphic.

        @param manipulate: a TeiManipulator object
        @param tree: the DOM tree to modify; it is saved only if something was removed
        """
        count = 0

        for title in tree.xpath('//tei:head', namespaces=self._TEI_NS):
            text = manipulate.get_stripped_text(title).strip()
            skip = manipulate.contains_graphic(title)

            # a heading that only holds a graphic has no text but must not be thrown away
            if text == '' and not skip:
                title.tag = 'REMOVE'
                count += 1

        # with_tail=False keeps any text that follows a removed heading
        etree.strip_elements(tree, 'REMOVE', with_tail=False)

        if count > 0:
            manipulate.save_tree(tree)
            self.debug.print_debug(self, u'Removed {0} empty titles'.format(count))

    def downgrade_oversize_headings(self, manipulate, tree):
        """
        Turns every "head" whose stripped text is longer than 200 characters into a "p".

        @param manipulate: a TeiManipulator object
        @param tree: the DOM tree to modify and save
        """
        for title in tree.xpath('//tei:head', namespaces=self._TEI_NS):
            text = manipulate.get_stripped_text(title)

            if len(text) > 200:
                title.tag = 'p'
                manipulate.save_tree(tree)
                self.debug.print_debug(self, u'Over-length heading downgraded')

    def handle_capital_only_paragraph(self, manipulate, new_size):
        """
        Promotes paragraphs to headings when they consist solely of capital letters followed
        by a colon, or when their rend attribute contains "capsall".

        @param manipulate: a TeiManipulator object
        @param new_size: the size styling to apply to these elements
        """
        tree = manipulate.load_dom_tree()

        # compiled once; the raw string keeps the exact same pattern text as before
        regex = re.compile(r'^[A-Z]+\:$')

        for child in tree.xpath('//tei:p', namespaces=self._TEI_NS):
            text = manipulate.get_stripped_text(child).strip()

            if regex.match(text) or ('rend' in child.attrib and 'capsall' in child.attrib['rend']):
                child.attrib['meTypesetSize'] = str(new_size)
                child.tag = 'head'
                manipulate.save_tree(tree)
                self.debug.print_debug(self, u'Changed item {0} to a heading size {1}'.format(text, new_size))

    def handle_single_item_list(self, manipulate, new_size):
        """
        Replaces lists that contain exactly one "item" with a heading: the list wrapper is
        stripped and its children become "head" elements of the given size.

        @param manipulate: a TeiManipulator object
        @param new_size: the size styling to apply to these elements
        """
        tree = manipulate.load_dom_tree()

        for child in tree.xpath('//tei:list[count(tei:item)=1]', namespaces=self._TEI_NS):
            # the list element itself is only unwrapped; its children are retained
            child.tag = 'REMOVE'

            for item in child:
                text = manipulate.get_stripped_text(item)
                item.attrib['meTypesetSize'] = str(new_size)
                item.tag = 'head'

            etree.strip_tags(tree, 'REMOVE')
            manipulate.save_tree(tree)
            self.debug.print_debug(self, u'Changed item {0} to a heading size {1}'.format(text, new_size))

    def clean_introduction_headings(self, manipulate):
        """
        Removes the size attribute from sized "hi" elements whose paragraph ends with a colon
        and is immediately followed by a "cit" element, so that quote introductions are not
        turned into headings.

        @param manipulate: a TeiManipulator object
        @return: the DOM tree loaded through manipulate.load_dom_tree(), with the changes applied
        """
        tree = manipulate.load_dom_tree()

        titles = tree.xpath('//tei:p[following-sibling::*[1][self::tei:cit]]/tei:hi[@meTypesetSize]',
                            namespaces=self._TEI_NS)

        for element in titles:
            text = manipulate.get_stripped_text(element.getparent()).strip()

            if text.endswith(':'):
                del element.attrib['meTypesetSize']
                manipulate.save_tree(tree)
                self.debug.print_debug(self, u'Removed heading attribute from {0} as it looks '
                                             u'like a quote introduction'.format(text))

        return tree

    def clean_line_breaks(self, manipulate):
        """
        Removes "lb" elements from headings when no text has been seen in front of them.

        @param manipulate: a TeiManipulator object
        @return: the DOM tree loaded through manipulate.load_dom_tree(), with the changes applied
        """
        tree = manipulate.load_dom_tree()

        titles = tree.xpath('//tei:head/tei:hi[@meTypesetSize][tei:lb] | //tei:head[tei:lb]',
                            namespaces=self._TEI_NS)

        for element in titles:
            total_text = manipulate.get_stripped_text(element)

            # running total of the text encountered so far inside this element
            text = element.text if element.text is not None else ''
            last_element = None

            for item in element:
                if item.tag.endswith('lb') and text.strip() == '':
                    # removing the lb would drop its tail, so the tail is re-homed first
                    # NOTE(review): the tail is written to the previous sibling's text (not its
                    # tail), and that sibling's own tail has not been added to the running text
                    # at this point -- behaviour kept as found, confirm the intent.
                    prev = item.getprevious()
                    if prev is not None:
                        prev.text = item.tail
                    else:
                        element.text = item.tail

                    item.getparent().remove(item)
                    manipulate.save_tree(tree)
                    self.debug.print_debug(self, u'Removed unneeded lb from {0}'.format(total_text))

                if item.text is not None:
                    text += item.text

                if last_element is not None and last_element.tail is not None:
                    text = text + last_element.tail

                last_element = item

        return tree

    def renest_headings(self, manipulate, tree):
        """
        For every "div" whose only child is a "head", pulls the following siblings into that
        div until the next such heading-only div (or the end of the siblings) is reached.

        @param manipulate: a TeiManipulator object
        @param tree: the DOM tree to modify and save
        @return: the modified tree
        """
        titles = tree.xpath('//tei:div[count(*) = 1][tei:head]', namespaces=self._TEI_NS)

        for element in titles:
            text = manipulate.get_stripped_text(element).strip()
            next_element = element.getnext()

            while next_element is not None and next_element not in titles:
                # fetch the successor before the append re-parents next_element
                next_to_add_element = next_element.getnext()
                element.append(next_element)
                next_element = next_to_add_element

                manipulate.save_tree(tree)
                self.debug.print_debug(self, u'Re-nested element under {0}'.format(text))

        return tree

    def final_headings(self, manipulate, tree):
        """
        For every "div" without a "head", promotes its first child to "head" unless that child
        is found to contain non-bold text or a figure/graphic.

        @param manipulate: a TeiManipulator object
        @param tree: the DOM tree to modify and save
        """
        sections = tree.xpath('//tei:div[not(tei:head)]', namespaces=self._TEI_NS)

        for element in sections:
            if len(element) == 0:
                continue

            candidate = element[0]
            bolded = True

            # NOTE(review): a first child without any rend attribute and without children
            # passes both checks below and is promoted -- confirm this is intended.
            if candidate.text != '' and 'rend' in candidate.attrib and 'bold' not in candidate.attrib['rend']:
                bolded = False

            if len(candidate) > 0 and bolded:
                for pelement in candidate:
                    if pelement.tag.endswith('fig') or pelement.tag.endswith('graphic'):
                        bolded = False
                        break

                    text = manipulate.get_stripped_text(pelement).strip()

                    # any child with text that is not marked as bold disqualifies the candidate
                    if text != '' and ('rend' not in pelement.attrib or 'bold' not in pelement.attrib['rend']):
                        bolded = False
                        break

            if bolded:
                candidate.tag = 'head'
                self.debug.print_debug(self, u'Replaced empty title with bolded sibling')

        manipulate.save_tree(tree)

    def run(self):
        """
        Executes the complete size classification pipeline on the TEI file, provided that the
        '--aggression' argument is at least the level configured for this module.
        """
        if int(self.gv.settings.args['--aggression']) < int(self.gv.settings.get_setting('sizeclassifier', self,
                                                                                         domain='aggression')):
            self.debug.print_debug(self, u'Aggression level too low: exiting module.')
            return

        manipulate = TeiManipulate(self.gv)

        # transform bolded paragraphs into size-attributes with an extremely high threshold (so will be thought of as
        # root nodes)
        self.handle_bold_only_paragraph(manipulate, 100)

        # if a paragraph only contains capitals followed by a colon, make it a heading (root node size)
        self.handle_capital_only_paragraph(manipulate, 100)

        # if a list contains only a single item, make it a heading (root node size)
        self.handle_single_item_list(manipulate, 100)

        tree = self.correlate_styled_headings(manipulate)

        # this deals with cases where the user has given a styled heading ending with a colon
        # immediately before a disp-quote
        tree = self.clean_introduction_headings(manipulate)

        # refresh the size list
        sizes = self.get_sizes(tree)

        tree = self.convert_to_headings(manipulate, sizes, tree)

        # assign IDs to every single heading tag for easy manipulation
        manipulate.tag_headings()

        # this deals with cases where line breaks exist within <head> tags but there is no text before; we remove them
        tree = self.clean_line_breaks(manipulate)

        tree = manipulate.load_dom_tree()

        self.downgrade_oversize_headings(manipulate, tree)
        self.remove_empty_headings(manipulate, tree)

        tree = manipulate.load_dom_tree()

        self.encapsulate_headings(manipulate, tree)

        backup_tree = etree.tostring(tree, encoding="unicode")

        stack, tree = self.nest_headings(manipulate, tree)

        if not self.verify_headings(stack, tree):
            # something went very wrong in the stacking of elements
            # revert to the backup tree
            self.debug.print_debug(self, u'Reverting to backup tree as size classification failed')
            tree = etree.fromstring(backup_tree)
            manipulate.save_tree(tree)

        # re-nest headings where a single heading and nothing else is found within a section
        backup_tree = etree.tostring(tree, encoding="unicode")

        tree = self.renest_headings(manipulate, tree)

        if not self.verify_headings(stack, tree):
            # something went very wrong in the stacking of elements
            # revert to the backup tree
            self.debug.print_debug(self, u'Reverting to backup tree as size classification failed')
            tree = etree.fromstring(backup_tree)
            manipulate.save_tree(tree)

        self.final_headings(manipulate, tree)