#!/usr/bin/env python
from teimanipulate import *
[docs]__author__ = "Martin Paul Eve"
[docs]__email__ = "martin@martineve.com"
"""
A class that scans for meTypeset size fields in a TEI file.
1.) Identifies a list of sizes
2.) Ascertains the density and likelihood of the size being a heading
3.) Returns a manipulator ready to implement all the changes to the TEI file
"""
from debug import Debuggable
[docs]class SizeClassifier(Debuggable):
def __init__(self, global_variables):
self.gv = global_variables
self.debug = self.gv.debug
self.size_cutoff = int(self.gv.settings.get_setting('minimum-heading-size', self))
self.max_headings = int(self.gv.settings.get_setting('maximum-headings', self))
self.root = 0
self.tree = None
Debuggable.__init__(self, 'Size Classifier')
@staticmethod
[docs] def get_values(tree, search_attribute):
# this function searches the DOM tree for TEI "hi" elements with the specified search_attribute
sizes = {}
for child in tree.xpath('//tei:hi[@' + search_attribute + '=not("")]',
namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}):
if child.get(search_attribute) in sizes:
sizes[child.get(search_attribute)] += 1
else:
sizes[child.get(search_attribute)] = 1
return sizes
@staticmethod
[docs] def get_sizes_ordered(tree):
# this function searches the DOM tree for TEI "head" elements with the specified search_attribute
sizes_ordered = []
for child in tree.xpath("//tei:head[@meTypesetSize=not('')]",
namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}):
sizes_ordered.append(child.get("meTypesetSize"))
return sizes_ordered
[docs] def set_dom_tree(self, filename):
p = etree.XMLParser(remove_blank_text=True, resolve_entities=False)
self.tree = etree.parse(filename, p)
return self.tree
@staticmethod
[docs] def handle_bold_only_paragraph(manipulate, root_size):
"""
This method looks for paragraphs that contain only bold text. It then transforms them to titles.
@param manipulate: a TeiManipulator object
@param root_size: the size styling to apply to these elements
"""
expression = u'//tei:p[(contains(@rend, "bold") or count(tei:hi) = count(tei:hi[contains(@rend, "bold")])) ' \
u'and not(text()[normalize-space()!=""])]/tei:hi'
manipulate.change_self_size(expression, str(root_size))
@staticmethod
[docs] def handle_heading(manipulate, attribute, root_size):
"""
This method looks for paragraphs that contain the specified attribute in their rend tag.
It then transforms them to titles.
@param manipulate: a TeiManipulator object
@param attribute: a string to search for in the rend attribute
@param root_size: the size styling to apply to these elements
"""
expression = u'//tei:p[contains(@rend, "{0}")]'.format(attribute)
manipulate.enclose_and_change_self_size(expression, str(root_size), 'p', 'hi')
[docs] def get_sizes(self, tree):
sizes = self.get_values(tree, "meTypesetSize")
if len(sizes) > 0:
self.debug.print_debug(self,
u'Explicitly specified size variations and their frequency of '
u'occurrence: {0}'.format(str(sizes)))
new_sizes = {}
for size, frequency in sizes.items():
if float(frequency) < float(self.max_headings):
new_sizes[size] = frequency
sizes = new_sizes
return sizes
[docs] def correlate_styled_headings(self, manipulate):
# reload the DOM
tree = self.set_dom_tree(self.gv.tei_file_path)
# get a numerical list of explicit size values inside meTypesetSize attributes
sizes = self.get_sizes(tree)
sorted_list = []
headings = {}
# correlate tag sizes specified by true word headings ("heading 1", "heading 2" etc.) to our index
for size, frequency in sizes.items():
if float(frequency) < float(self.max_headings) and float(size) > float(self.size_cutoff):
sorted_list.append(size)
sorted_list = sorted(sorted_list)
if len(sorted_list) > 0:
for count in range(0, len(sorted_list) - 1):
key = u'heading {0}'.format(count + 1)
headings[key] = sorted_list[count]
key = u'Heading {0}'.format(count + 1)
headings[key] = sorted_list[count]
for count in range(len(sorted_list) - 1, 8):
key = u'heading {0}'.format(count + 1)
headings[key] = 100 - 10 * count
key = u'Heading {0}'.format(count + 1)
headings[key] = 100 - 10 * count
else:
headings = {'title': 100, 'heading 1': 100, 'heading 2': 90, 'heading 3': 80, 'heading 4': 70,
'heading 5': 60, 'heading 6': 50, 'heading 7': 40, 'heading 8': 30, 'heading 9': 20}
headings.update({'Title': 100, 'Heading 1': 100, 'Heading 2': 90, 'Heading 3': 80,
'Heading 4': 70, 'Heading 5': 60, 'Heading 6': 50, 'Heading 7': 40,
'Heading 8': 30, 'Heading 9': 20})
headings.update({'H1': 100, 'H2': 90, 'H3': 80, 'H4': 70, 'H5': 60, 'H6': 50, 'H7': 40,
'H8': 30, 'H9': 20})
for key, value in headings.items():
self.debug.print_debug(self, u'Changing {0} to size {1}'.format(key, value))
self.handle_heading(manipulate, key, float(value))
# reload the DOM
tree = self.set_dom_tree(self.gv.tei_file_path)
return tree
[docs] def convert_to_headings(self, manipulate, sizes, tree):
for size in sizes:
if float(size) >= float(self.size_cutoff):
# if the size is greater than or equal to 16, treat it as a heading
self.debug.print_debug(self,
u'Size ({0}) greater '
u'than or equal to {1}. '
u'Treating as a heading.'.format(str(size),
str(self.size_cutoff)))
# instruct the manipulator to change the parent tag of every tag it finds containing
# a "hi" tag with meTypesetSize set to the value found to "title"
# so, for example <p><hi meTypesetSize="18">some text</hi></p>
# will be transformed to
# <title><hi meTypesetSize="18">some text</hi></title>
manipulate.change_outer('//tei:hi[@meTypesetSize=\'{0}\']'.format(size), 'head', size)
tree = self.set_dom_tree(self.gv.tei_file_path)
for normalize in tree.xpath('//tei:cit/tei:quote/tei:head',
namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}):
normalize.getparent().tag = 'REMOVE'
normalize.getparent().getparent().tag = 'REMOVE'
etree.strip_tags(tree, 'REMOVE')
manipulate.save_tree(tree)
self.debug.print_debug(self, u'Normalizing nested headings inside cit/quote blocks')
return tree
[docs] def encapsulate_headings(self, manipulate, tree):
titles = tree.xpath('//tei:head[preceding-sibling::node()]', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
for title in titles:
existing_section = title.getparent()
new_section = etree.Element('div')
sibling = title
to_move = []
while sibling is not None:
to_move.append(sibling)
sibling = sibling.getnext()
for sibling in to_move:
new_section.append(sibling)
existing_section.addnext(new_section)
manipulate.save_tree(tree)
self.debug.print_debug(self, u'Handling unnested title: '
u'{0}'.format(manipulate.get_stripped_text(title).strip()))
manipulate.save_tree(tree)
[docs] def nest_headings(self, manipulate, tree):
tree = manipulate.load_dom_tree()
stack = []
message = {}
for div in tree.xpath('//tei:div', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}):
title = div.xpath('tei:head', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
if len(title) == 0:
size = 100
message[div] = 'No title found in this block'
else:
size = title[0].attrib['meTypesetSize']
message[div] = manipulate.get_stripped_text(title[0]).strip()
stack.append((size, div))
first = True
position = 0
root_size = None
root_div = None
dict_thresholds = {}
for element in stack:
if first:
first = False
root_size, root_div = element
self.debug.print_debug(self, u'Set root size as {0}'.format(root_size))
else:
size, div = element
previous, previous_div = stack[position - 1]
if float(size) > float(root_size):
size = float(root_size)
# handle an element that is the root size
if float(size) == float(root_size):
root_div = div
dict_thresholds[float(root_size)] = position
for item in dict_thresholds.keys():
dict_thresholds[item] = position
self.debug.print_debug(self, u'Heading {0} ("{1}") was same size as root. '
u'Resetting stack.'.format(position + 1,
message[div]))
# handle an element that is smaller than its predecessor
elif float(size) < float(previous):
addnext = False
# traverse up the tree to see if there is an equal size element
iteration = position - 1
if not float(size) in dict_thresholds.keys():
dict_thresholds[float(size)] = position
while iteration >= dict_thresholds[float(size)]:
iterpos, iterdiv = stack[iteration]
if float(iterpos) == float(size):
previous_div = iterdiv
addnext = True
break
else:
iteration -= 1
if addnext:
previous_div.addnext(div)
else:
previous_div.append(div)
dict_thresholds[float(size)] = position
manipulate.save_tree(tree)
self.debug.print_debug(self, u'Moved heading {0} ("{1}") into previous because '
u'it is smaller'.format(position + 1,
message[div]))
# handle an element that is bigger than its predecessor
elif float(size) > float(previous):
# traverse up the tree to see if there is an equal size element
iteration = position - 1
found = False
if not float(size) in dict_thresholds.keys():
dict_thresholds[float(size)] = position
while iteration >= dict_thresholds[float(size)]:
iterpos, iterdiv = stack[iteration]
if float(iterpos) == float(size):
previous_div = iterdiv
break
else:
iteration -= 1
previous_div.addnext(div)
dict_thresholds[float(size)] = position
for item in dict_thresholds.keys():
if float(dict_thresholds[item]) < float(size):
dict_thresholds[item] = position
manipulate.save_tree(tree)
self.debug.print_debug(self, u'Moved heading {0} ("{1}") into previous '
u'because it is bigger'.format(position + 1,
message[div]))
# handle an element that is the same size as its predecessor
elif float(size) == float(previous):
previous_div.addnext(div)
self.debug.print_debug(self, u'Added heading {0} ("{1}") adjacent to previous because '
u'it is the same size'.format(position + 1,
message[div]))
position += 1
return stack, tree
[docs] def verify_headings(self, stack, tree):
# verify that the stack has not been disordered
position = 0
for div in tree.xpath('//tei:div', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}):
size, verify = stack[position]
if verify != div:
self.debug.write_error(self, u'Size elements were disordered', '002')
self.debug.print_debug(self, u'WARNING: size elements were disordered')
return False
position += 1
return True
[docs] def remove_empty_headings(self, manipulate, tree):
count = 0
for title in tree.xpath('//tei:head', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}):
text = manipulate.get_stripped_text(title).strip()
skip = manipulate.contains_graphic(title)
if text == '':
title.tag = 'REMOVE'
count += 1
etree.strip_elements(tree, 'REMOVE')
if count > 0:
manipulate.save_tree(tree)
self.debug.print_debug(self, u'Removed {0} empty titles'.format(count))
[docs] def downgrade_oversize_headings(self, manipulate, tree):
for title in tree.xpath('//tei:head', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}):
text = manipulate.get_stripped_text(title)
if len(text) > 200:
title.tag = 'p'
manipulate.save_tree(tree)
self.debug.print_debug(self, u'Over-length heading downgraded')
[docs] def handle_capital_only_paragraph(self, manipulate, new_size):
tree = manipulate.load_dom_tree()
for child in tree.xpath('//tei:p', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}):
text = manipulate.get_stripped_text(child).strip()
regex = re.compile('^[A-Z]+\:$')
if regex.match(text) or ('rend' in child.attrib and 'capsall' in child.attrib['rend']):
child.attrib['meTypesetSize'] = str(new_size)
child.tag = 'head'
manipulate.save_tree(tree)
self.debug.print_debug(self, u'Changed item {0} to a heading size {1}'.format(text, new_size))
[docs] def handle_single_item_list(self, manipulate, new_size):
tree = manipulate.load_dom_tree()
for child in tree.xpath('//tei:list[count(tei:item)=1]', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'}):
child.tag = 'REMOVE'
for item in child:
text = manipulate.get_stripped_text(item)
item.attrib['meTypesetSize'] = str(new_size)
item.tag = 'head'
etree.strip_tags(tree, 'REMOVE')
manipulate.save_tree(tree)
self.debug.print_debug(self, u'Changed item {0} to a heading size {1}'.format(text, new_size))
[docs] def clean_introduction_headings(self, manipulate):
tree = manipulate.load_dom_tree()
titles = tree.xpath('//tei:p[following-sibling::*[1][self::tei:cit]]/tei:hi[@meTypesetSize]',
namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
for element in titles:
text = manipulate.get_stripped_text(element.getparent()).strip()
if text.endswith(':'):
del element.attrib['meTypesetSize']
manipulate.save_tree(tree)
self.debug.print_debug(self, u'Removed heading attribute from {0} as it looks '
u'like a quote introduction'.format(text))
return tree
[docs] def clean_line_breaks(self, manipulate):
tree = manipulate.load_dom_tree()
titles = tree.xpath('//tei:head/tei:hi[@meTypesetSize][tei:lb] | //tei:head[tei:lb]',
namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
for element in titles:
total_text = manipulate.get_stripped_text(element)
text = element.text
if text is None:
text = ''
last_element = None
for item in element:
if item.tag.endswith('lb') and text.strip() == '':
prev = item.getprevious()
if prev is not None:
prev.text = item.tail
else:
element.text = item.tail
item.getparent().remove(item)
manipulate.save_tree(tree)
self.debug.print_debug(self, u'Removed unneeded lb from {0}'.format(total_text))
if item.text is not None:
text += item.text
if last_element is not None and last_element.tail is not None:
text = text + last_element.tail
last_element = item
return tree
[docs] def renest_headings(self, manipulate, tree):
titles = tree.xpath('//tei:div[count(*) = 1][tei:head]', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
for element in titles:
text = manipulate.get_stripped_text(element).strip()
next_element = element.getnext()
while next_element is not None and not next_element in titles:
next_to_add_element = next_element.getnext()
element.append(next_element)
next_element = next_to_add_element
manipulate.save_tree(tree)
self.debug.print_debug(self, u'Re-nested element under {0}'.format(text))
return tree
[docs] def final_headings(self, manipulate, tree):
sections = tree.xpath('//tei:div[not(tei:head)]', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})
for element in sections:
if len(element) > 0:
title = element[0]
if title is not None:
next_element = title
bolded = True
if next_element.text != '' and 'rend' in next_element.attrib and not 'bold' in next_element.attrib['rend']:
bolded = False
if len(next_element) > 0 and bolded:
for pelement in next_element:
if pelement.tag.endswith('fig') or pelement.tag.endswith('graphic'):
bolded = False
break
text = manipulate.get_stripped_text(pelement).strip()
if (text != '' and 'rend' in pelement.attrib and not 'bold' in pelement.attrib['rend']) \
or (text != '' and not 'rend' in pelement.attrib):
bolded = False
break
if bolded:
next_element.tag = 'head'
self.debug.print_debug(self, u'Replaced empty title with bolded sibling')
manipulate.save_tree(tree)
[docs] def run(self):
if int(self.gv.settings.args['--aggression']) < int(self.gv.settings.get_setting('sizeclassifier', self,
domain='aggression')):
self.debug.print_debug(self, u'Aggression level too low: exiting module.')
return
manipulate = TeiManipulate(self.gv)
# transform bolded paragraphs into size-attributes with an extremely high threshold (so will be thought of as
# root nodes)
self.handle_bold_only_paragraph(manipulate, 100)
# if a paragraph only contains capitals followed by a colon, make it a heading (root node size)
self.handle_capital_only_paragraph(manipulate, 100)
# if a list contains only a single item, make it a heading (root node size)
self.handle_single_item_list(manipulate, 100)
tree = self.correlate_styled_headings(manipulate)
# this deals with cases where the user has given a styled heading ending with a colon
# immediately before a disp-quote
tree = self.clean_introduction_headings(manipulate)
# refresh the size list
sizes = self.get_sizes(tree)
tree = self.convert_to_headings(manipulate, sizes, tree)
# assign IDs to every single heading tag for easy manipulation
heading_count = manipulate.tag_headings()
# this deals with cases where line breaks exist within <head> tags but there is no text before; we remove them
tree = self.clean_line_breaks(manipulate)
tree = manipulate.load_dom_tree()
self.downgrade_oversize_headings(manipulate, tree)
self.remove_empty_headings(manipulate, tree)
tree = manipulate.load_dom_tree()
self.encapsulate_headings(manipulate, tree)
backup_tree = etree.tostring(tree, encoding="unicode")
stack, tree = self.nest_headings(manipulate, tree)
if not self.verify_headings(stack, tree):
# something went very wrong in the stacking of elements
# revert to the backup tree
self.debug.print_debug(self, u'Reverting to backup tree as size classification failed')
tree = etree.fromstring(backup_tree)
manipulate.save_tree(tree)
# re-nest headings where a single heading and nothing else is found within a section
backup_tree = etree.tostring(tree, encoding="unicode")
tree = self.renest_headings(manipulate, tree)
if not self.verify_headings(stack, tree):
# something went very wrong in the stacking of elements
# revert to the backup tree
self.debug.print_debug(self, u'Reverting to backup tree as size classification failed')
tree = etree.fromstring(backup_tree)
manipulate.save_tree(tree)
self.final_headings(manipulate, tree)