# Source code for bin.meTypeset

#!/usr/bin/env python
"""meTypeset: text parsing library to convert word documents to the JATS XML format

Usage:
    meTypeset.py doc <input> <output_folder> [options]
    meTypeset.py docx <input> <output_folder> [options]
    meTypeset.py docxextracted <input> <output_folder> [options]
    meTypeset.py odt <input> <output_folder> [options]
    meTypeset.py other <input> <output_folder> [options]
    meTypeset.py tei <input> <output_folder> [options]
    meTypeset.py bibscan <input> [options]

Options:
    -a, --aggression <aggression_level>             Parser aggression level 0-10 [default: 10]
    --chain <xslt>                                  Specify a subsequent XSL transform to pass the NLM to
    -c, --clean                                     Produce final XML, not intermediate markup with additional metadata
    -d, --debug                                     Enable debug output
    -i, --identifiers                               Generate unique identifiers for all supported NLM elements
    --includedeleted                                Keep deleted text (track changes)
    --interactive                                   Enable step-by-step interactive mode
    -h, --help                                      Show this screen.
    -m, --metadata <metadata_file>                  Metadata file
    --nogit                                         Disable git debug filesystem (only of use with --debug)
    --noimageprocessing                             Disable unoconv image processing
    --nolink                                        Do not run reference linker
    --nometa                                        Do not merge front matter
    --purenlm                                       Die after performing NLM XSLT step
    --puretei                                       Die after performing TEI XSLT step
    --prettytei                                     Indent and format intermediary TEI
    -p, --proprietary                               Enables proprietary math parsing. Requires omml2mml.xsl
    -s, --settings <settings_file>                  Settings file
    -v, --version                                   Show version.
    -z, --zotero                                    Enable Zotero integration for references.

"""

# module authorship metadata
__author__ = "Martin Paul Eve, Dulip Withnage"
__email__ = "martin@martineve.com"

from docxtotei import *
from teitonlm import *
from sizeclassifier import *
from frontmatterparser import *
from docopt import docopt
from teimanipulate import TeiManipulate
from globals import *
from debug import Debuggable
from bibliographyaddins import BibliographyAddins
from bibliographydatabase import BibliographyDatabase
from bibliographyclassifier import BibliographyClassifier
from listclassifier import ListClassifier
from metadata import Metadata
from referencelinker import ReferenceLinker
from xslchainer import XslChain
from settingsconfiguration import Settings
from idgenerator import IdGenerator
from captionclassifier import CaptionClassifier
from complianceenforcer import ComplianceEnforcer
from interactive import Interactive
from unoconvtodocx import UnoconvToDocx


# check whether lxml is installed
# a failed import is only reported on stdout here; nothing is raised, so
# execution carries on past this point without the etree name being bound
try:
    # noinspection PyUnresolvedReferences
    from lxml import etree
except ImportError:
    print("Failed to import lxml")


class MeTypeset (Debuggable):
    """Command-line driver that wires the individual conversion modules together.

    The constructor parses the command line with docopt, sets up the debugger,
    loads the settings file and builds the shared GV object that is handed to
    every module. run() then executes the pipeline selected by the command
    (doc, docx, docxextracted, odt, other, tei or bibscan).
    """

    # docopt command -> format name handed to UnoconvToDocx.run(); inputs given
    # with these commands are converted to docx before the TEI transform
    _UNOCONV_COMMANDS = (('doc', 'doc'), ('odt', 'odt'), ('other', 'unoconv'))

    def __init__(self):
        # read command line arguments
        self.args = self.read_command_line()

        # absolute first priority is to initialize debugger so that anything triggered here can be logged
        self.debug = Debug()

        Debuggable.__init__(self, 'Main')

        if self.args['--debug']:
            self.debug.enable_debug(self.args['--nogit'])

        # read settings file
        self.tei_file_path = None
        self.settings_file_path = Settings.setup_settings_file(self.args)
        self.settings = Settings(Settings.get_settings_file(self, self.settings_file_path), self.args)
        self.gv = GV(self.settings, self.debug)

        self.debug.enable_prompt(Interactive(self.gv))

    @staticmethod
    def read_command_line():
        """Parse sys.argv against the module docstring and return docopt's argument dictionary."""
        return docopt(__doc__, version='meTypeset 0.1')

    def set_metadata_file(self):
        """Work out which metadata file should be merged into the output.

        The value of --metadata is used when it was given; otherwise the
        'default-metadata-file-path' setting is used and the fallback is
        logged. In both cases the name is joined onto settings.script_dir and
        passed through clean_path.

        :return: the cleaned metadata file path
        """
        path_tools = self.gv.settings
        metadata_file_arg = self.settings.args['--metadata']

        if metadata_file_arg:
            # docopt hands back a plain string for a non-repeated option, so
            # indexing it with [0] would keep only the first character of the
            # path; only unwrap when a list/tuple really was supplied
            if isinstance(metadata_file_arg, (list, tuple)):
                metadata_file_arg = metadata_file_arg[0]

            return path_tools.clean_path(path_tools.concat_path(self.settings.script_dir,
                                                                metadata_file_arg))

        default_name = path_tools.get_setting('default-metadata-file-path', self)
        metadata_file = path_tools.clean_path(path_tools.concat_path(self.settings.script_dir,
                                                                     default_name))

        self.debug.print_debug(self, u'Metadata file wasn\'t specified. '
                                     'Falling back to {0}'.format(metadata_file))

        return metadata_file

    def _convert_input_to_tei(self):
        """Run the input-specific conversion steps that end with the DocxToTei transform."""
        proprietary = self.args['--proprietary']

        for command, unoconv_format in self._UNOCONV_COMMANDS:
            if self.args[command]:
                # run unoconv conversion to docx
                # then run docx to tei
                UnoconvToDocx(self.gv).run(unoconv_format)
                DocxToTei(self.gv).run(True, proprietary)
                return

        if self.args['docx']:
            # run docx to tei conversion
            # includes hooks for proprietary transforms if enabled
            DocxToTei(self.gv).run(True, proprietary)
        elif self.args['docxextracted']:
            self.debug.print_debug(self, u'Skipping docx extraction')
            DocxToTei(self.gv).run(False, proprietary)
        elif self.args['tei']:
            self.debug.print_debug(self, u'Skipping docx extraction; processing TEI file')
            DocxToTei(self.gv).run(False, proprietary, tei=True)

    def run_modules(self):
        """Execute the pipeline selected on the command line.

        With the bibscan command only the bibliography database scan is run.
        For every other command the input is converted to TEI, classified and
        transformed to NLM, and the NLM is then post-processed. --puretei and
        --purenlm make the method return straight after the respective
        transform.
        """
        ag = int(self.gv.settings.args['--aggression'])
        self.debug.print_debug(self,
                               u'Running at aggression level {0} {1}'.format(ag,
                                                                             "[grrr!]" if ag == 10 else ""))

        if ag > 10:
            self.debug.print_debug(self, "WARNING: safety bail-out features are disabled at aggression level 11")

        if self.args['bibscan']:
            BibliographyDatabase(self.gv).scan()
            return

        # check for stylesheets
        self.gv.check_file_exists(self.gv.docx_style_sheet_dir)

        # metadata file: store it on this run's GV instance (the original
        # assigned to a bare "gv" name, which this module never defines)
        self.gv.metadata_file = self.set_metadata_file()

        self.gv.mk_dir(self.gv.output_folder_path)

        self._convert_input_to_tei()

        if self.args['--puretei']:
            self.debug.print_debug(self, u'Exiting as TEI transform complete')
            return

        metadata = Metadata(self.gv)
        metadata.pre_clean()

        # run size classifier
        # aggression 5
        SizeClassifier(self.gv).run()

        # run bibliographic addins handler
        # aggression 4
        found_bibliography = BibliographyAddins(self.gv).run()

        # run list classifier
        # aggression 4
        ListClassifier(self.gv).run()

        bibliography_classifier = BibliographyClassifier(self.gv)

        if not found_bibliography:
            # run bibliographic classifier
            # aggression 4
            bibliography_classifier.run()

        # tei
        # aggression 3
        TeiManipulate(self.gv).run()

        # run tei to nlm conversion
        TeiToNlm(self.gv).run(not found_bibliography)

        if self.gv.settings.args['--purenlm']:
            self.debug.print_debug(self, u'Exiting as NLM transform complete')
            return

        manipulate = NlmManipulate(self.gv)

        if not self.gv.used_list_method:
            manipulate.fuse_references()

        # run reference linker
        if not self.args['--nolink']:
            rl = ReferenceLinker(self.gv)
            rl.run(self.args['--interactive'])
            rl.cleanup()

        # run table and graphic caption classifiers, each only when the
        # requested aggression exceeds the threshold from the settings file
        cc = CaptionClassifier(self.gv)

        table_threshold = int(self.gv.settings.get_setting('tablecaptions', self, domain='aggression'))
        if int(self.args['--aggression']) > table_threshold:
            cc.run_tables()

        graphic_threshold = int(self.gv.settings.get_setting('graphiccaptions', self, domain='aggression'))
        if int(self.args['--aggression']) > graphic_threshold:
            cc.run_graphics()

        cc.run_ext_link_compliance()
        manipulate.double_p_compliance()

        # run metadata merge
        if not self.args['--nometa']:
            metadata.run()

        if self.args['--interactive']:
            bibliography_classifier.run_prompt(True)

        # process any bibliography entries that are possible
        BibliographyDatabase(self.gv).run()

        # remove stranded titles and cleanup
        manipulate.final_clean()

        if self.args['--identifiers']:
            IdGenerator(self.gv).run()

        if self.args['--chain']:
            # construct and run an XSLT chainer
            XslChain(self.gv).run()

        if self.args['--clean']:
            ComplianceEnforcer(self.gv).run()

    def run(self):
        """Run the selected pipeline, followed by the temporary NLM file cleanup check."""
        self.run_modules()

        # NOTE(review): self.debug is the Debug object created in __init__, not
        # the --debug flag, so unless Debug defines its own truth value this
        # condition is never met and the temporary file is never removed.
        # Presumably the flag was meant; confirm before changing, since that
        # would start deleting files and could raise for runs that never
        # produced the file (e.g. bibscan or --puretei) - TODO confirm.
        if not self.debug:
            os.remove(self.gv.nlm_temp_file_path)


def main():
    """Build a MeTypeset instance (which parses the command line) and run it."""
    MeTypeset().run()


# only start the conversion when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()