Source code for bin.docxtotei

#!/usr/bin/env python

import distutils
import os
import errno
import shutil
import zipfile
import subprocess
import re
import globals as gv

[docs]__author__ = "Dulip Withanage"
[docs]__email__ = "dulip.withanage@gmail.com"
from debug import Debuggable from teimanipulate import TeiManipulate from lxml import etree
[docs]class DocxToTei(Debuggable): def __init__(self, gv): self.gv = gv self.debug = gv.debug Debuggable.__init__(self, 'DOCX to TEI')
[docs] def saxon_doc_to_tei(self): """ Creates the appropriate java command to run Saxon @return: a string to run on the command line """ cmd = ["java", "-classpath", self.gv.java_class_path, "-Dxml.catalog.files="+self.gv.runtime_catalog_path, "net.sf.saxon.Transform", "-x", "org.apache.xml.resolver.tools.ResolvingXMLReader", "-y", "org.apache.xml.resolver.tools.ResolvingXMLReader", "-r", "org.apache.xml.resolver.tools.CatalogResolver", "-o", self.gv.settings.clean_path(self.gv.tei_file_path), self.gv.word_document_xml, self.gv.docx_to_tei_stylesheet ] return ' '.join(cmd)
[docs] def saxon_omml_to_mml(self): """ Creates the appropriate java command to run Saxon @return: a string to run on the command line """ cmd = ["java", "-classpath", self.gv.java_class_path, "-Dxml.catalog.files="+self.gv.runtime_catalog_path, "net.sf.saxon.Transform", "-x", "org.apache.xml.resolver.tools.ResolvingXMLReader", "-y", "org.apache.xml.resolver.tools.ResolvingXMLReader", "-r", "org.apache.xml.resolver.tools.CatalogResolver", "-o", self.gv.word_document_xml, self.gv.word_document_xml, self.gv.proprietary_style_sheet ] return ' '.join(cmd)
[docs] def handle_wmf(self): """ Calls unoconv to convert wmf images into png format. This method has a hard limit of 30 images. @return: False if fails (more than 30 images), True otherwise """ image_filenames = os.listdir(self.gv.output_media_path) if len(image_filenames) > 30: self.debug.print_debug(self, u'Abandoning image conversion as there are over thirty images (DoS mitigation)') return False for image in image_filenames: if re.match(r'.+?\.(w|e)mf', image) is not None: image_name = re.sub(r'\.(w|e)mf', '', image) imagemagick_command = '{3}*DELIMITER*-d*DELIMITER*graphics*DELIMITER*-f*DELIMITER*png*DELIMITER*-o' \ '*DELIMITER*{0}/{1}.png*DELIMITER*' \ '{0}/{2}'.format(self.gv.output_media_path, image_name, image, self.gv.settings.get_setting('unoconv', self)) self.debug.print_debug(self, u'Calling: {0}'.format(imagemagick_command.replace('*DELIMITER*', ' '))) subprocess.call(imagemagick_command.split('*DELIMITER*')) return True
[docs] def clean_proprietary(self): p = etree.XMLParser(remove_blank_text=True, resolve_entities=False) tree = etree.parse(self.gv.word_document_xml, p) omml = tree.xpath('//m:oMath', namespaces={'m': 'http://schemas.openxmlformats.org/officeDocument/2006/math'}) for omml_paragraph in omml: omml_paragraph.tag = '{http://www.w3.org/1998/Math/MathML}math' etree.strip_tags(tree, '{http://schemas.openxmlformats.org/officeDocument/2006/math}oMathPara') omml = tree.xpath('//m:oMathParaPr', namespaces={'m': 'http://schemas.openxmlformats.org/officeDocument/2006/math'}) for omml_paragraph in omml: omml_paragraph.getparent().remove(omml_paragraph) tree.write(self.gv.word_document_xml)
[docs] def run(self, extract, run_proprietary, tei=False): """ This method converts from docx to TEI. It creates the necessary output folders, optionally extracts the file and runs the Saxon process necessary to conduct the transform @param extract: whether or not to extract a docx file. True to extract, False to work on a pre-extracted folder @param run_proprietary: whether or not to run proprietary math transforms """ # make output folders self.gv.mk_dir(self.gv.docx_temp_folder_path) self.gv.mk_dir(self.gv.common2_temp_folder_path) self.gv.mk_dir(self.gv.tei_folder_path) #copy folders self.gv.copy_folder(self.gv.common2_lib_path, self.gv.common2_temp_folder_path, False, None) self.gv.copy_folder(self.gv.docx_folder_path, self.gv.docx_temp_folder_path, False, None) if extract: # decompress the docx self.debug.print_debug(self, u'Unzipping {0} to {1}'.format(self.gv.input_file_path, self.gv.docx_temp_folder_path)) with zipfile.ZipFile(self.gv.input_file_path, "r") as z: z.extractall(self.gv.docx_temp_folder_path) elif not tei: self.gv.copy_folder(self.gv.input_file_path, self.gv.docx_temp_folder_path) else: shutil.copy2(self.gv.input_file_path, self.gv.tei_file_path) self.debug.print_debug(self, u'Looking for presence of media directory {0}'.format(self.gv.docx_media_path)) if os.path.isdir(self.gv.docx_media_path): self.debug.print_debug(self, u'Ripping out media directory') self.gv.mk_dir(self.gv.output_media_path) self.gv.copy_folder(self.gv.docx_media_path, self.gv.output_media_path, False, None) if not self.gv.settings.args['--noimageprocessing']: self.handle_wmf() # copy input file into the docx subfolder if extract: shutil.copy(self.gv.input_file_path, self.gv.docx_temp_folder_path) else: pass #self.gv.tei_file_path = self.gv.tei_file_path + 'tei.xml' if not tei: # fix dud LibreOffice conversion doc_prop = open(os.path.join(self.gv.docx_temp_folder_path, 'docProps', 'core.xml'), 'r+') contents = doc_prop.read() contents = re.sub('\&\s', '\&\s', contents) doc_prop.seek(0) doc_prop.write(contents) doc_prop.truncate() doc_prop.close() if run_proprietary: # run a transform on the copied docx to generate a new version of the Word XML that includes MML java_command = self.saxon_omml_to_mml() self.debug.print_debug(self, u'Running saxon transform (DOCX->MML DOCX) [proprietary]') subprocess.call(java_command, stdin=None, shell=True) self.clean_proprietary() # saxon converter java_command = self.saxon_doc_to_tei() self.debug.print_debug(self, u'Running saxon transform (DOCX->TEI)') subprocess.call(java_command, stdin=None, shell=True) # delete temp folders if not self.gv.debug.debug: shutil.rmtree(self.gv.docx_temp_folder_path) shutil.rmtree(self.gv.common2_temp_folder_path) if os.path.exists(self.gv.unoconv_folder_path): shutil.rmtree(self.gv.unoconv_folder_path) # update path to TEI from normalized saxon output self.gv.tei_file_path = self.gv.settings.clean_path(self.gv.tei_file_path)