# Source code for bin.meTypeset
#!/usr/bin/env python
"""meTypeset: text parsing library to convert word documents to the JATS XML format
Usage:
meTypeset.py doc <input> <output_folder> [options]
meTypeset.py docx <input> <output_folder> [options]
meTypeset.py docxextracted <input> <output_folder> [options]
meTypeset.py odt <input> <output_folder> [options]
meTypeset.py other <input> <output_folder> [options]
meTypeset.py tei <input> <output_folder> [options]
meTypeset.py bibscan <input> [options]
Options:
-a, --aggression <aggression_level> Parser aggression level 0-10 [default: 10]
--chain <xslt> Specify a subsequent XSL transform to pass the NLM to
-c, --clean Produce final XML, not intermediate markup with additional metadata
-d, --debug Enable debug output
-i, --identifiers Generate unique identifiers for all supported NLM elements
--includedeleted Keep deleted text (track changes)
--interactive Enable step-by-step interactive mode
-h, --help Show this screen.
-m, --metadata <metadata_file> Metadata file
--nogit Disable git debug filesystem (only of use with --debug)
--noimageprocessing Disable unoconv image processing
--nolink Do not run reference linker
--nometa Do not merge front matter
--purenlm Die after performing NLM XSLT step
--puretei Die after performing TEI XSLT step
--prettytei Indent and format intermediary TEI
-p, --proprietary Enables proprietary math parsing. Requires omml2mml.xsl
-s, --settings <settings_file> Settings file
-v, --version Show version.
-z, --zotero Enable Zotero integration for references.
"""
__author__ = "Martin Paul Eve, Dulip Withnage"
__email__ = "martin@martineve.com"
from docxtotei import *
from teitonlm import *
from sizeclassifier import *
from frontmatterparser import *
from docopt import docopt
from teimanipulate import TeiManipulate
from globals import *
from debug import Debuggable
from bibliographyaddins import BibliographyAddins
from bibliographydatabase import BibliographyDatabase
from bibliographyclassifier import BibliographyClassifier
from listclassifier import ListClassifier
from metadata import Metadata
from referencelinker import ReferenceLinker
from xslchainer import XslChain
from settingsconfiguration import Settings
from idgenerator import IdGenerator
from captionclassifier import CaptionClassifier
from complianceenforcer import ComplianceEnforcer
from interactive import Interactive
from unoconvtodocx import UnoconvToDocx
# check whether lxml is installed
try:
    # noinspection PyUnresolvedReferences
    from lxml import etree
except ImportError:
    # The failure is only reported on stdout; it is not re-raised, so module
    # loading continues past this point even when lxml is unavailable.
    print("Failed to import lxml")
class MeTypeset (Debuggable):
    """Command-line driver object for meTypeset.

    Constructing an instance parses the command line with docopt and builds
    the debugger, settings and global-variable (``GV``) objects. Calling
    :meth:`run` then executes the processing modules selected by the parsed
    arguments.
    """

    def __init__(self):
        """Parse the command line and set up debugging, settings and globals."""
        # read command line arguments
        self.args = self.read_command_line()

        # absolute first priority is to initialize debugger so that anything triggered here can be logged
        self.debug = Debug()

        Debuggable.__init__(self, 'Main')

        if self.args['--debug']:
            self.debug.enable_debug(self.args['--nogit'])

        # read settings file
        self.tei_file_path = None
        self.settings_file_path = Settings.setup_settings_file(self.args)
        self.settings = Settings(Settings.get_settings_file(self, self.settings_file_path), self.args)

        self.gv = GV(self.settings, self.debug)

        self.debug.enable_prompt(Interactive(self.gv))

    @staticmethod
    def read_command_line():
        """Return the docopt-parsed arguments built from the module docstring."""
        return docopt(__doc__, version='meTypeset 0.1')

    def _convert_input_to_tei(self):
        """Run the input-to-TEI step that matches the selected command.

        ``doc``, ``odt`` and ``other`` inputs are first passed through
        ``UnoconvToDocx``; every branch then hands over to ``DocxToTei``.
        """
        proprietary = self.args['--proprietary']

        if self.args['doc']:
            # run doc to docx conversion
            # then run docx to tei
            UnoconvToDocx(self.gv).run('doc')
            DocxToTei(self.gv).run(True, proprietary)
        elif self.args['odt']:
            # run odt to docx conversion
            # then run docx to tei
            UnoconvToDocx(self.gv).run('odt')
            DocxToTei(self.gv).run(True, proprietary)
        elif self.args['other']:
            # run other unoconv-supported format to docx conversion
            # then run docx to tei
            UnoconvToDocx(self.gv).run('unoconv')
            DocxToTei(self.gv).run(True, proprietary)
        elif self.args['docx']:
            # run docx to tei conversion
            # includes hooks for proprietary transforms if enabled
            DocxToTei(self.gv).run(True, proprietary)
        elif self.args['docxextracted']:
            self.debug.print_debug(self, u'Skipping docx extraction')
            DocxToTei(self.gv).run(False, proprietary)
        elif self.args['tei']:
            self.debug.print_debug(self, u'Skipping docx extraction; processing TEI file')
            DocxToTei(self.gv).run(False, proprietary, tei=True)

    def run_modules(self):
        """Run the processing modules selected by the command-line arguments.

        In ``bibscan`` mode only the bibliography database scan is run.
        Otherwise the input is converted to TEI, classified and manipulated,
        transformed to NLM and post-processed. ``--puretei`` and
        ``--purenlm`` stop the pipeline early after their respective
        transform. Always returns ``None``.
        """
        ag = int(self.gv.settings.args['--aggression'])
        self.debug.print_debug(self,
                               u'Running at aggression level {0} {1}'.format(ag,
                                                                             "[grrr!]" if ag == 10 else ""))

        if ag > 10:
            self.debug.print_debug(self, "WARNING: safety bail-out features are disabled at aggression level 11")

        if self.args['bibscan']:
            # bibscan mode: nothing but the database scan is executed
            BibliographyDatabase(self.gv).scan()
            return

        # check for stylesheets
        self.gv.check_file_exists(self.gv.docx_style_sheet_dir)

        # metadata file
        # Stored on self.gv: no bare name ``gv`` is bound in this method, and
        # every other access in this class goes through self.gv.
        # NOTE(review): set_metadata_file is not defined in this class as seen
        # here; presumably provided elsewhere (e.g. a base class) - confirm.
        self.gv.metadata_file = self.set_metadata_file()

        self.gv.mk_dir(self.gv.output_folder_path)

        self._convert_input_to_tei()

        if self.args['--puretei']:
            self.debug.print_debug(self, u'Exiting as TEI transform complete')
            return

        metadata = Metadata(self.gv)
        metadata.pre_clean()

        # run size classifier
        # aggression 5
        SizeClassifier(self.gv).run()

        # run bibliographic addins handler
        # aggression 4
        found_bibliography = BibliographyAddins(self.gv).run()

        # run list classifier
        # aggression 4
        ListClassifier(self.gv).run()

        bibliography_classifier = BibliographyClassifier(self.gv)

        if not found_bibliography:
            # run bibliographic classifier
            # aggression 4
            bibliography_classifier.run()

        # tei
        # aggression 3
        TeiManipulate(self.gv).run()

        # run tei to nlm conversion
        TeiToNlm(self.gv).run(not found_bibliography)

        if self.gv.settings.args['--purenlm']:
            self.debug.print_debug(self, u'Exiting as NLM transform complete')
            return

        manipulate = NlmManipulate(self.gv)

        if not self.gv.used_list_method:
            manipulate.fuse_references()

        # run reference linker
        if not (self.args['--nolink']):
            rl = ReferenceLinker(self.gv)
            rl.run(self.args['--interactive'])
            rl.cleanup()

        # run table classifier
        cc = CaptionClassifier(self.gv)

        # each caption pass only runs when the requested aggression exceeds
        # the threshold configured for it in the settings
        if int(self.args['--aggression']) > int(self.gv.settings.get_setting('tablecaptions',
                                                                             self, domain='aggression')):
            cc.run_tables()

        if int(self.args['--aggression']) > int(self.gv.settings.get_setting('graphiccaptions',
                                                                             self, domain='aggression')):
            cc.run_graphics()

        cc.run_ext_link_compliance()

        manipulate.double_p_compliance()

        # run metadata merge
        if not (self.args['--nometa']):
            metadata.run()

        if self.args['--interactive']:
            bibliography_classifier.run_prompt(True)

        # process any bibliography entries that are possible
        BibliographyDatabase(self.gv).run()

        # remove stranded titles and cleanup
        manipulate.final_clean()

        if self.args['--identifiers']:
            IdGenerator(self.gv).run()

        if self.args['--chain']:
            # construct and run an XSLT chainer
            XslChain(self.gv).run()

        if self.args['--clean']:
            ComplianceEnforcer(self.gv).run()

    def run(self):
        """Run all modules, then apply the temporary NLM file cleanup check."""
        self.run_modules()

        # NOTE(review): self.debug is the Debug instance created in __init__,
        # so unless Debug defines __nonzero__/__bool__/__len__ this condition
        # is never true and the temporary file is never removed. Behaviour is
        # deliberately left unchanged here; confirm the intended flag before
        # enabling the removal (the file may not exist on every code path).
        if not self.debug:
            os.remove(self.gv.nlm_temp_file_path)
def main():
    """Build a MeTypeset instance from the command line and run it."""
    MeTypeset().run()
# Only start processing when executed as a script, not when imported.
if __name__ == '__main__':
    main()