Source code for bin.frontmatterparser

#!/usr/bin/env python

__author__ = "Dulip Withanage"
__email__ = "dulip.withanage@gmail.com"
import re import string import sys import operator import globals as gv import os import subprocess import shutil #from django.utils.encoding import smart_str
class FrontMatterParser:
    """Regex-based, heuristic extraction of author and title strings.

    The parser works on a string of lightly marked-up text (``<p>``,
    ``<bold>``, ``<italic>`` tags or bare newlines are recognised as
    delimiters).  It also carries small helpers that read and write the
    files named by the ``gv`` settings object.

    ``gv`` must expose ``NLM_FILE_PATH`` and ``NLM_TEMP_FILE_PATH``; these
    are the only attributes this class reads from it.
    """

    # "Jane Doe and John Smith"-style byline: 2-5 name tokens, a joining
    # word (&, and, et, und), then 2-5 more name tokens.
    # this works for perception-monospace, equations tables, laddering,
    # neoliberalism, snowball, valuechain, sodium
    _AUTHORS_JOINED_RE = re.compile(
        r'(\n|<p>|<bold>|<italic>)(([A-Za-z\-\.]+)\*?\s){2,5}(&|and|et|und)\s'
        r'(([A-Za-z\-\.]+)\*?\s?){2,5}(</p>|</bold>|</italic>|\n)')

    # Comma/space separated token list wrapped in a single element.
    # this works for racialprofiling, antiseptics, eeg_comicsans, leadership,
    # systemsthinker
    # this would work for science.doc but there are way too many authors and
    # that affects the string
    # would work for rating.doc but need to fix linebreak comments from output
    _AUTHOR_LIST_RE = re.compile(
        r'(<p>|<bold>|<italic>)(([A-Za-z\-\.]+)(,?\s)){1,20}([A-Za-z\-\.]+)?'
        r'(</p>|</bold>|</italic>)')

    # A single title line.
    # this works for antiseptics, eeg_comicsans, entrepreneurship, laddering,
    # racialprofiling, snowball, sodium
    _TITLE_LINE = (
        r'(\n|<p>|<bold>|<italic>)(([A-Za-z\-\.]+)((:|,)?\s)){1,20}'
        r'([A-Za-z\-\.]+)?\??(</p>|</bold>|</italic>|\n)')
    _TITLE_RE = re.compile(_TITLE_LINE)

    # A line ending in ':' followed (lazily) by another title line, i.e. a
    # title and subtitle that ended up in separate elements.  ``[\s\S]*?``
    # matches the same text as the former ``(.|\s)*?`` but without the
    # overlapping alternatives that allow exponential backtracking.
    _TITLE_WITH_SUBTITLE_RE = re.compile(
        r'(\n|<p>|<bold>|<italic>)(([A-Za-z\-\.]+)(,?\s)){1,20}'
        r'([A-Za-z\-\.]+)?:(</p>|</bold>|</italic>|\n)'
        r'[\s\S]*?'
        + _TITLE_LINE)

    # Non-greedy-by-construction tag matcher: never spans past the first '>'.
    _TAG_RE = re.compile(r'<[^>]*>')

    # The word "by" (any case) as a whole word, plus trailing whitespace.
    _BYLINE_RE = re.compile(r'\bby\s+', re.IGNORECASE)

    def __init__(self, gv):
        """Store the settings object that supplies the file paths."""
        self.gv = gv

    def _strip_markup(self, fragment):
        """Return *fragment* without tags and with whitespace runs collapsed."""
        return ' '.join(self._TAG_RE.sub('', fragment).split())

    def _best_author_list(self, filestring):
        """Return the best-scoring list-style author candidate, or None.

        Each candidate scores one point per period and one per occurrence
        of ``italic`` in its matched text, minus its position among the
        candidates, so earlier and more abbreviation-heavy strings win.
        """
        candidates = [m.group(0)
                      for m in self._AUTHOR_LIST_RE.finditer(filestring)]
        if not candidates:
            return None

        def score(item):
            position, text = item
            return text.count('.') + text.count('italic') - position

        # max() keeps the first of several equal scores, i.e. the earliest.
        return max(enumerate(candidates), key=score)[1]

    def parse_authors(self, filestring):
        """Return the guessed author string found in *filestring*.

        The joined-byline pattern is tried first; if it finds nothing the
        best-scoring list-style candidate is used.  Tags and the word "by"
        are removed from the result.

        Returns an empty string when no candidate is found.
        """
        match = self._AUTHORS_JOINED_RE.search(filestring)
        if match is not None:
            candidate = match.group(0)
        else:
            candidate = self._best_author_list(filestring)
            if candidate is None:
                return ''

        # this is the author string. could try sending to parscit to get
        # individual author names.
        return self._strip_markup(self._BYLINE_RE.sub('', self._TAG_RE.sub('', candidate)))

    # entrepreneurship needs fixing, will be tough, has authors in multiple
    # XML elements

    def parse_title(self, filestring):
        """Return the guessed title found in *filestring*.

        A title/subtitle pair split across two elements is preferred; if
        none is found the first single title line is used.  Tags are
        removed and whitespace is collapsed.

        Returns an empty string when nothing matches.
        """
        # need to anticipate which other special characters are allowable
        # in titles
        # first, check if a subtitle and title have wound up separated from
        # one another
        match = self._TITLE_WITH_SUBTITLE_RE.search(filestring)
        if match is None:
            match = self._TITLE_RE.search(filestring)
        if match is None:
            return ''
        return self._strip_markup(match.group(0))

    def get_file_text(self, filename):
        """Return the full contents of *filename* as a string."""
        with open(filename) as handle:
            return handle.read()

    def update_tmp_file(self):
        """Copy ``gv.NLM_FILE_PATH`` over ``gv.NLM_TEMP_FILE_PATH``."""
        shutil.copy2(self.gv.NLM_FILE_PATH, self.gv.NLM_TEMP_FILE_PATH)

    def write_output(self, text):
        """Overwrite ``gv.NLM_FILE_PATH`` with *text*."""
        with open(self.gv.NLM_FILE_PATH, 'w') as out:
            out.write(text)

    def run(self):
        """Read the temp file, parse it, write it out and refresh the temp copy.

        The text is written back unchanged; the parsed title is not yet
        used, and author parsing is currently disabled.
        """
        text = self.get_file_text(self.gv.NLM_TEMP_FILE_PATH)
        self.parse_title(text)
        self.write_output(text)
        self.update_tmp_file()