# Source code for bin.nlmmanipulate

__author__ = "Martin Paul Eve"
__email__ = "martin@martineve.com"

from manipulate import Manipulate
from lxml import etree
import re
import uuid


class NlmManipulate(Manipulate):
    def __init__(self, gv):
        """
        Binds this manipulator to the NLM document described by the global variables object.

        @param gv: the shared global variables object; its debug, nlm_file_path and nlm_temp_file_path
                   attributes are read here
        """
        self.gv = gv
        self.debug = gv.debug

        # the document that gets loaded and the temporary copy; save_tree writes to both
        self.dom_to_load = gv.nlm_file_path
        self.dom_temp_file = gv.nlm_temp_file_path

        self.namespaces = {
            'jats': 'http://dtd.nlm.nih.gov/publishing/3.0/journalpublishing3.dtd',
            'xmlns:xlink': 'http://www.w3.org/1999/xlink',
        }
        self.mod_name = 'NLM'

        # the base initialiser is invoked last, after the attributes above have been set
        Manipulate.__init__(self, gv)

[docs]    def remove_reference_numbering(self):
        tree = self.load_dom_tree()

        for ref in tree.xpath('//ref'):
            if hasattr(ref, 'text') and ref.text is not None:
                text = ref.text

                ref.text = re.sub(r'^\d\d*\s*\.?\s*', r'', text)

        self.save_tree(tree)

[docs]    def remove_empty_elements(self, element):
        tree = self.load_dom_tree()

        for paragraph in tree.xpath(element):
            found = False
            text = self.get_stripped_text(paragraph).strip()

            if text == '':
                for item in paragraph:
                    if self.get_stripped_text(item) != '' or item.tag == 'graphic':
                        found = True
                        break
            else:
                found = True

            if not found and (paragraph.tail is None or paragraph.tail == ''):
                paragraph.getparent().remove(paragraph)
                self.save_tree(tree)
                self.debug.print_debug(self, u'Removed an empty element')
            elif not found and paragraph.tail != '':
                sibling = paragraph.getprevious()

                if sibling is None:
                    if paragraph.getparent().text is not None:
                        paragraph.getparent().text += paragraph.tail
                    else:
                        paragraph.getparent().text = paragraph.tail
                else:
                    sibling.tail = paragraph.tail

                paragraph.getparent().remove(paragraph)
                self.save_tree(tree)
                self.debug.print_debug(self, u'Removed an empty element but preserved tail')

        self.save_tree(tree)

[docs]    def double_p_compliance(self):
        self.debug.print_debug(self, u'Attempting to correct any mis-nested paragraph elements')

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()
        bad_ps = tree.xpath('//p/p')

        for p in bad_ps:
            p_parent = p.getparent()
            parent = p_parent.getparent()
            parent.insert(parent.index(p_parent)+1, p)

        self.save_tree(tree)


    @staticmethod
    def handle_nested_elements(iter_node, move_node, node, node_parent, outer_node, tag_name, tail_stack,
                               tail_stack_objects):
        """
        Rebuilds the chain of elements that wrap a marker node so that the marker's tail text can be moved.

        Starting at iter_node, the ancestors are walked upwards until an element named tag_name is reached;
        the tag and the element of every level passed are appended to tail_stack and tail_stack_objects, and
        both lists are then reversed in place. A new, empty element is created for each recorded tag: the
        first is added directly after the first recorded element (after reversal) and every further one is
        appended inside the previous new element. The last new element receives node.tail as its text.
        Finally node is removed from its parent.

        @param iter_node: the element from which to start walking upwards
        @param move_node: not read; the first returned value is True on success
        @param node: the marker node whose tail is to be carried over
        @param node_parent: not read; the third returned value is the ancestor named tag_name
        @param outer_node: returned unchanged if no new element is created, otherwise replaced
        @param tag_name: the tag name of the ancestor at which the walk stops
        @param tail_stack: list that collects the tags passed on the way up (mutated)
        @param tail_stack_objects: list that collects the elements passed on the way up (mutated)
        @return: (True, first new element, ancestor named tag_name), or (None, None, None) when iter_node
                 is None or the walk runs out of ancestors
        """
        if iter_node is None:
            return None, None, None

        # climb to the enclosing tag_name element, remembering every level on the way
        ancestor = iter_node
        while ancestor.tag != tag_name:
            tail_stack.append(ancestor.tag)
            tail_stack_objects.append(ancestor)
            ancestor = ancestor.getparent()
            if ancestor is None:
                return None, None, None

        tail_stack.reverse()
        tail_stack_objects.reverse()

        # rebuild the styled tree on a set of subelements
        innermost = len(tail_stack) - 1
        previous = None
        for depth, wrapper_tag in enumerate(tail_stack):
            rebuilt = etree.Element(wrapper_tag)

            if depth == innermost:
                # the tail (of the comment) becomes the text of the deepest new element
                rebuilt.text = node.tail

            if depth == 0:
                outer_node = rebuilt
                tail_stack_objects[0].addnext(rebuilt)
            else:
                Manipulate.append_safe(previous, rebuilt, None)

            previous = rebuilt

        # remove the old node (this is in the element above)
        node.getparent().remove(node)

        # the outermost new node is handed back so that the caller can find its siblings
        return True, outer_node, ancestor

    @staticmethod
    def search_and_copy(last_node, move_node, nested_sibling, new_element, node, node_parent):
        """
        Moves everything that follows a marker node into a new element and inserts that element.

        The following siblings of node (elements and text) are collected with XPath. Element results are
        appended to new_element. A text result that arrives before any other result becomes the text of
        new_element or, when move_node is set, becomes the tail of node, which is then itself appended to
        new_element. Any later text result becomes the tail of the item handled before it. new_element is
        then added after nested_sibling or, when nested_sibling is None, after node_parent, whose tail is
        cleared in that case.

        Text results are recognised with isinstance(..., (str, bytes)) so that the check does not depend on
        lxml's private result classes.

        @param last_node: not read; kept for interface compatibility
        @param move_node: whether node itself should be moved into new_element together with its new tail
        @param nested_sibling: an element after which to insert new_element, or None to use node_parent
        @param new_element: the element that receives the copied content
        @param node: the marker node whose following siblings are copied
        @param node_parent: the element after which new_element is inserted when nested_sibling is None
        @return: new_element when move_node is false, otherwise None
        """
        last_append = None

        for following in node.xpath('following-sibling::node()'):
            if isinstance(following, (str, bytes)):
                if last_append is None:
                    last_append = new_element
                    if move_node:
                        node.tail = following
                        Manipulate.append_safe(last_append, node, None)
                    else:
                        last_append.text = following
                else:
                    last_append.tail = following
            else:
                Manipulate.append_safe(new_element, following, None)
                last_append = following

        if nested_sibling is None:
            node_parent.addnext(new_element)
            node_parent.tail = ''
        else:
            nested_sibling.addnext(new_element)

        if move_node:
            return None

        # remove the original tail (all text after the line break, for example)
        # <!--meTypeset:br-->A third break <-- "a third break" is the tail
        node.tail = ''
        return new_element

[docs]    def process_node_for_tags(self, nested_sibling, node, search_xpath, tag_name, new_tag='SAME'):
        if new_tag == 'SAME':
            new_tag = tag_name

        last_node = node
        new_element = etree.Element(new_tag)
        new_element.text = ''
        nodes_to_copy = node.xpath('//{0}/following-sibling::node()'.format(search_xpath))

        if len(nodes_to_copy) == 0:
            return

        self.debug.print_debug(self, u'Found {0} nodes to copy: {1}'.format(len(nodes_to_copy),
                                                                           nodes_to_copy))
        #for element in nodes_to_copy:
        element = nodes_to_copy[0]
        # noinspection PyProtectedMember
        if not type(element) is etree._Element:
            if node.tail == element:
                # this should handle cases where a tag spans the break
                # for example: <p>Some text <italic>some italic text<!--meTypeset:br-->
                # more italic text</italic> more text</p>
                node_parent = node.getparent()
                iter_node = node_parent
                tail_stack = []
                tail_stack_objects = []
                move_node = False
                outer_node = None

                if node_parent.tag != tag_name:
                    # the element here is nested (bold etc), so we need to move the tail to be the tail of the
                    # outermost element and change "node" to be correct for its siblings to be the rest of
                    # the element. So we might find the last part is in italics and bold, so we build a list and
                    # iterate over it within the new copied element
                    move_node, node, node_parent = self.handle_nested_elements(iter_node, move_node, node,
                                                                               node_parent, outer_node, tag_name,
                                                                               tail_stack, tail_stack_objects)

                # search for all siblings and copy them into a new element below
                last_node = self.search_and_copy(last_node, move_node, nested_sibling, new_element, node,
                                                 node_parent)

            else:
                new_element.tail = node.tail
                node.tail = ''
        else:
            Manipulate.append_safe(new_element, element, self)
        if not last_node is None:
            last_node.addnext(new_element)
            node.getparent().remove(node)

    @staticmethod
[docs]    def add_error_tag(node, error_number):
        rend = 'error-{0}'.format(error_number)

        if u'rend' in node.attrib:
            if not rend in node.attrib['rend']:
                # append the new value
                rend = u'{0} {1}'.format(node.attrib[u'rend'], rend)
            else:
                # just re-write the old value
                rend = node.attrib[u'rend']

        node.attrib[u'rend'] = rend

[docs]    def close_and_open_tag_not_styled(self, search_xpath, tag_name):
        """
        Opens and closes an XML tag within a document. This is primarily useful when we have a marker such as
        meTypeset:br in comments which corresponds to no JATS/NLM equivalent. We use this function in certain
        behavioural modes to close the preceding paragraph and open the next.

        This variant only performs this action when the subsequent text does not look like a heading.

        @param search_xpath: the node that serves as a marker
        @param tag_name: the tag name that will be open and closed
        """
        tree = self.load_dom_tree()

        initial_nodes = tree.xpath('//{0}//{1}'.format(tag_name, search_xpath))
        self.debug.print_debug(self, u'Found {0} {1} nodes on which to close and open tag {2}'.format(
            len(initial_nodes), search_xpath, tag_name))

        nested_sibling = None
        bail = False

        if len(initial_nodes) > 80 and int(self.gv.settings.args["--aggression"]) < 11:
            self.debug.print_debug(self, u'Bailing from replacement of tag {0} [limit exceeded]'.format(search_xpath))
            self.debug.write_error(self,
                                   'Bailing from replacement of tag {0} [limit exceeded]'.format(search_xpath),
                                   '001')
            bail = True

        if not bail:
            for node in initial_nodes:
                sibling = node

                while sibling.getnext() is not None:
                    try:
                        if sibling.tag.endswith('bold'):
                            bail = True
                    except:
                        bail = True
                        break

                if not bail:
                    self.process_node_for_tags(nested_sibling, node, search_xpath, tag_name, 'p')
        else:
            # add an error tag to p elements where there are more than 3 comments within
            children = tree.xpath('//*[count(comment()[.="meTypeset:br"]) > 3]'.format(search_xpath))

            for child in children:
                self.add_error_tag(child, u'001')

        self.save_tree(tree)

[docs]    def close_and_open_tag(self, search_xpath, tag_name):
        """
        Opens and closes an XML tag within a document. This is primarily useful when we have a marker such as
        meTypeset:br in comments which corresponds to no JATS/NLM equivalent. We use this function in certain
        behavioural modes to close the preceding paragraph and open the next.

        @param search_xpath: the node that serves as a marker
        @param tag_name: the tag name that will be open and closed
        """
        tree = self.load_dom_tree()

        initial_nodes = tree.xpath('//{0}//{1}'.format(tag_name, search_xpath))
        self.debug.print_debug(self, u'Found {0} {1} nodes on which to close and open tag {2}'.format(
            len(initial_nodes), search_xpath, tag_name))

        nested_sibling = None
        bail = False

        if len(initial_nodes) > 80 and int(self.gv.settings.args["--aggression"]) < 11:
            self.debug.print_debug(self, u'Bailing from replacement of tag {0} [limit exceeded]'.format(search_xpath))
            self.debug.write_error(self,
                                   'Bailing from replacement of tag {0} [limit exceeded]'.format(search_xpath),
                                   '001')
            bail = True

        if not bail:
            for node in initial_nodes:
                if not bail:
                    self.process_node_for_tags(nested_sibling, node, search_xpath, tag_name)
        else:
            # add an error tag to p elements where there are more than 3 comments within
            children = tree.xpath('//*[count(comment()[.="meTypeset:br"]) > 3]'.format(search_xpath))

            for child in children:
                self.add_error_tag(child, u'001')

        self.save_tree(tree)

[docs]    def save_tree(self, tree):
        tree.write(self.dom_temp_file, pretty_print=True)
        tree.write(self.dom_to_load, pretty_print=True)

[docs]    def find_text(self, paragraph, text):
        if paragraph.text and text in paragraph.text:
            return paragraph, False

        if paragraph.tail and text in paragraph.tail:
            return paragraph, True

        for sub_element in paragraph:
            ret, tail = self.find_text(sub_element, text)

            if ret is not None:
                return ret, tail

        return None, False

[docs]    def insert_break(self, search_xpath, tag_name):
        """
        Opens and closes an XML tag within a document. This is primarily useful when we have a marker such as
        meTypeset:br in comments which corresponds to no JATS/NLM equivalent. We use this function in certain
        behavioural modes to close the preceding paragraph and open the next.

        @param search_xpath: the node that serves as a marker
        @param tag_name: the tag name that will be open and closed
        """
        tree = self.load_dom_tree()

        initial_nodes = tree.xpath('//{0}//{1}'.format(tag_name,search_xpath))
        self.debug.print_debug(self, u'Found {0} {1} nodes on which to insert break: {2}'.format(
            len(initial_nodes), search_xpath, tag_name))

        for node in initial_nodes:
            break_element = etree.Element('break')
            node.addnext(break_element)
            node.getparent().remove(node)

        self.save_tree(tree)

[docs]    def reflist_indent_method(self, tree):
        # tag the last item as a reference list
        indentmethod = tree.xpath('(//sec[title][disp-quote] | //sec[title][list])[last()]')
        if indentmethod:
            for item in indentmethod:
                item.attrib['reflist'] = 'yes'

[docs]    def reflist_year_match_method(self, tree, root, tolerance):
        sections = tree.xpath(root)

        # work upwards as the last section is most likely to contain references
        for element in reversed(sections):
            found_other = False
            count = 0
            use_tag = None
            diff_count = 0

            for p in element:
                # use either p or disp-quote, but not a mix
                if use_tag is None:
                    if p.tag == 'p' or p.tag == 'disp-quote' or p.tag == 'list-item':
                        use_tag = p.tag

                if p.tag == use_tag:
                    for sub_element in p:
                        if sub_element.tag == 'p':
                            p = sub_element
                            break

                    text = self.get_stripped_text(p)

                    year_test = re.compile('((1|2)\d{3}[a-z]?)|(n\.d\.)')

                    match = year_test.findall(text)

                    if not match:
                        blank_text = re.compile('XXXX')
                        match_inner = blank_text.findall(text)
                        if not match_inner:
                            diff_count += 1

                            if diff_count > tolerance:
                                self.debug.print_debug(self, u'Too many different non-year matches found in this'
                                                             u' {1} section to classify as a reference block. '
                                                             u'(Allowed: {0})'.format(tolerance, root))
                                found_other = True
                                break
                        elif len(match_inner) == 1:
                            count += 1
                            p.attrib['rend'] = 'ref'
                        else:
                            page_test = re.compile('(((18|19|20)\d{2})\-((18|19|20)\d{2}))')
                            is_page_range = page_test.search(text)

                            if not is_page_range:
                                self.debug.print_debug(self, u'More than one year match found in this {0}'.format(root))
                                found_other = True
                                break
                    elif len(match) == 1:
                        # only do this if we find 1 match on the line; otherwise, it's a problem
                        count += 1
                        p.attrib['rend'] = 'ref'
                    else:
                        page_test = re.compile('(((18|19|20)\d{2})\-((18|19|20)\d{2}))')
                        is_page_range = page_test.search(text)

                        if not is_page_range:
                            self.debug.print_debug(self, u'More than one year match found in this {0}'.format(root))
                            found_other = True
                            break

                elif p.tag != 'title' and not use_tag is None:
                    # found a tag other than the one we want or 'title'
                    diff_count += 1

                    if diff_count > tolerance:
                        self.debug.print_debug(self, u'Too many different elements found in this {1} section to '
                                                     u'classify as a reference block. (Allowed: {0})'.format(tolerance,
                                                                                                             root))
                        found_other = True
                        break

            if count > 1 and not found_other:
                self.debug.print_debug(self, u'Found a reference list in a {0} block with '
                                             u'tolerance {1}'.format(root, tolerance))
                while element.tag != 'sec':
                    element = element.getparent()

                element.attrib['reflist'] = 'yes'
                return True
            else:
                for p in element:
                    if 'rend' in p.attrib:
                        del p.attrib['rend']

        return False

[docs]    def find_or_create_element(self, tree, element_tag, add_xpath, is_sibling):
        # find_or_create_elements(tree, 'back', '//body', true)
        ret = None
        try:
            ret = tree.xpath(u'//' + element_tag, namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})[0]
            self.debug.print_debug(self, u'Found existing {0}. Using it.'.format(element_tag))
        except:
            self.debug.print_debug(self, u'Unable to find an existing {0} element.'.format(element_tag))

        if ret is None:
            self.debug.print_debug(self, u'Creating new {0} element.'.format(element_tag))
            ret = tree.xpath(add_xpath)[0]
            new_element = etree.Element(element_tag)

            if is_sibling:
                ret.addnext(new_element)
            else:
                Manipulate.append_safe(ret, new_element, self)

            ret = new_element

        return ret

[docs]    def delete_special_lines(self):
        tree = self.load_dom_tree()

        special_regex = re.compile('^[\-\.\,\+\#\'\;\:]+$')

        to_remove = []

        for ref in tree.xpath('//p'):
            text = self.get_stripped_text(ref)

            if special_regex.match(text):
                ref.getparent().remove(ref)
                self.save_tree(tree)
                self.debug.print_debug(self, u'Removing special character line: {0}'.format(text))

[docs]    def clean_refs(self):
        tree = self.load_dom_tree()

        ref_regex = re.compile('^(?P<prelim>\s*\d+[\.\,]?\s+)(?P<reference>.+)')

        for ref in tree.xpath('//back/ref-list/ref'):
            if ref.text and ref_regex.match(ref.text):
                ref.text = ref_regex.sub('\\g<reference>', ref.text)
                self.save_tree(tree)
                self.debug.print_debug(self,
                                       u'Removing number/whitespace from start of reference: {0}'.format(ref.text))

        for ref in tree.xpath('//back/ref-list/ref[not(element-citation)]'):
            new_ref = etree.Element('ref')
            ref.addnext(new_ref)

            ref.tag = 'mixed-citation'
            new_ref.append(ref)

            if 'id' in ref.attrib:
                new_ref.attrib['id'] = ref.attrib['id']
                del ref.attrib['id']

        self.save_tree(tree)
        self.debug.print_debug(self, u'Encapsulated any loose refs inside mixed-citation blocks')

[docs]    def final_clean(self):
        self.delete_special_lines()
        self.handle_stranded_reference_titles_from_cues()
        self.clean_refs()
        self.remove_empty_elements('//fn-group')
        self.remove_empty_elements('//p')
        self.remove_empty_elements('//ref-list')

[docs]    def find_reference_list(self):
        if self.gv.used_list_method or self.gv.used_square_reference_method:
            return

        tree = self.load_dom_tree()

        # look for sections where very paragraph contains a year; likely to be a reference
        tags = ['//sec', '//sec/list']

        found = False

        for tag in tags:
            found = self.reflist_year_match_method(tree, tag, 0)

            if not found:
                found = self.reflist_year_match_method(tree, tag, 1)

            if not found:
                found = self.reflist_year_match_method(tree, tag, 2)

            if not found:
                found = self.reflist_year_match_method(tree, tag, 3)

        self.save_tree(tree)

[docs]    def handle_stranded_reference_titles_from_cues(self):
        # this method looks for paragraphs with one title element and nothing else whose text is in our
        # linguistic cues documents. It then removed them as superfluous.

        self.debug.print_debug(self, u'Checking for any stranded titles as a result of reference parsing')

        tree = self.load_dom_tree()

        xpath = '//sec[(count(p) = 0) and (count(title) = 1)]'

        language_list = self.gv.settings.get_setting('reference-languages', self).split(',')

        reference_terms = []

        for language in language_list:
            with open ('{0}/language/ref_marker_{1}.txt'.format(self.gv.script_dir, language), 'r', encoding="utf8") as lang_file:
                lines = lang_file.read().split('\n')

                for line in lines:
                    reference_terms.append(line.lower())

        for sections in tree.xpath(xpath):
            process = True
            for item in sections:
                if item.tag != 'title':
                    process = False

            if process:
                for item in sections:
                    text = self.get_stripped_text(item).strip()

                    if text.lower() in reference_terms:
                        sections.getparent().remove(sections)
                        self.save_tree(tree)
                        self.debug.print_debug(self, u'Removed a stranded title: {0}'.format(text))

[docs]    def fuse_references(self):
        tree = self.load_dom_tree()

        for ref in tree.xpath('//back/ref-list/ref'):
            text = self.get_stripped_text(ref)

            year_test = re.compile('((1|2)\d{3}[a-z]?)|(n\.d\.)')
            match = year_test.findall(text)

            if not match and ref.getprevious() is not None:
                ref.tag = 'REMOVE'
                ref.getprevious().append(ref)

                etree.strip_tags(tree, 'REMOVE')

                self.save_tree(tree)
                self.debug.print_debug(self, u'Appending {0} to previous ref'.format(text))

[docs]    def tag_bibliography_refs(self):

        tree = self.load_dom_tree()

        existing_refs = tree.xpath('//back/ref-list')

        if len(existing_refs) > 0:
            return

        self.find_or_create_element(tree, 'back', '//body', True)
        ref_list = self.find_or_create_element(tree, 'ref-list', '//back', False)

        # change this to find <reflist> elements after we're more certain of how to identify them
        for refs in tree.xpath('//sec[@reflist="yes"]/p[@rend="ref"] | //sec[@reflist="yes"]/title '
                               '| //sec[@reflist="yes"]/*/listitem/p[@rend="ref"] | '
                               '//sec[@reflist="yes"]/*/p[@rend="ref"]'):

            if refs.tag == 'title':
                self.debug.print_debug(self, u'Removing title element from reference item')
                refs.getparent().remove(refs)
            else:
                self.debug.print_debug(self, u'Tagging element "{0}" as reference item'.format(refs.tag))
                refs.tag = 'ref'
                refs.attrib['id'] = u'ID{0}'.format(uuid.uuid4())

                if 'rend' in refs.attrib:
                        del refs.attrib['rend']

                Manipulate.append_safe(ref_list, refs, self)

        self.save_tree(tree)