#!/usr/bin/env python
# Module authorship metadata (stray "[docs]" extraction markers removed so the
# assignments are valid Python).
__author__ = "Dulip Withanage"
__email__ = "dulip.withanage@gmail.com"
import re
import string
import sys
import operator
import globals as gv
import os
import subprocess
import shutil
#from django.utils.encoding import smart_str
class FrontMatterParser:
    """Heuristically pull front matter (author line, title) out of markup text.

    The parser works on a plain string containing ``<p>``, ``<bold>`` and
    ``<italic>`` tags and uses regular expressions to guess which fragment is
    the author line and which is the title.  File locations are read from the
    settings object passed to the constructor (``NLM_FILE_PATH`` and
    ``NLM_TEMP_FILE_PATH`` attributes).
    """

    # Two groups of 2-5 name tokens joined by a conjunction (&, and, et, und).
    # this works for perception-monospace, equations tables, laddering,
    # neoliberalism, snowball, valuechain, sodium
    _AUTHORS_WITH_CONJUNCTION = re.compile(
        r'(\n|<p>|<bold>|<italic>)(([A-Za-z\-\.]+)\*?\s){2,5}(&|and|et|und)\s(([A-Za-z\-\.]+)\*?\s?){2,5}(</p>|</bold>|</italic>|\n)'
    )

    # A tagged run of (optionally comma separated) name tokens.
    # this works for racialprofiling, antiseptics, eeg_comicsans, leadership,
    # systemsthinker
    # this would work for science.doc but there are way too many authors and
    # that affects the string
    # would work for rating.doc but need to fix linebreak comments from output
    _AUTHORS_LIST = re.compile(
        r'(<p>|<bold>|<italic>)(([A-Za-z\-\.]+)(,?\s)){1,20}([A-Za-z\-\.]+)?(</p>|</bold>|</italic>)'
    )

    # A title ending in ':' followed (possibly after other text) by a subtitle.
    # The gap is written as [\s\S]*? rather than (.|\s)*?: it matches the same
    # span, but the two alternatives no longer overlap on spaces, which avoids
    # exponential backtracking when the subtitle part fails to match.
    _TITLE_WITH_SUBTITLE = re.compile(
        r'(\n|<p>|<bold>|<italic>)(([A-Za-z\-\.]+)(,?\s)){1,20}([A-Za-z\-\.]+)?:(</p>|</bold>|</italic>|\n)[\s\S]*?(\n|<p>|<bold>|<italic>)(([A-Za-z\-\.]+)((:|,)?\s)){1,20}([A-Za-z\-\.]+)?\??(</p>|</bold>|</italic>|\n)'
    )

    # A single title line.
    # this works for antiseptics, eeg_comicsans, entrepreneurship, laddering,
    # racialprofiling, snowball, sodium
    _TITLE_SINGLE = re.compile(
        r'(\n|<p>|<bold>|<italic>)(([A-Za-z\-\.]+)((:|,)?\s)){1,20}([A-Za-z\-\.]+)?\??(</p>|</bold>|</italic>|\n)'
    )

    # One markup tag.  Deliberately not '<.*>': the greedy form removes
    # everything between the first '<' and the last '>', text included.
    _TAG = re.compile(r'<[^>]*>')

    # The word "by" (any case) as a standalone token; the word boundary keeps
    # names such as "Kirby" intact.
    _BYLINE_WORD = re.compile(r'\b[Bb][Yy]\s')

    def __init__(self, gv):
        """Store the settings object that provides the file paths."""
        self.gv = gv

    @classmethod
    def _plain_text(cls, fragment):
        """Return *fragment* without tags and with whitespace runs collapsed."""
        return ' '.join(cls._TAG.sub('', fragment).split())

    def _best_author_list(self, filestring):
        """Return the highest scoring author-list candidate, or '' if none.

        Each candidate scores one point per period and per occurrence of
        "italic" in its matched text, minus its position in match order, so
        earlier and more initial-heavy fragments are preferred.  On a tie the
        earliest candidate wins.
        """
        best_text = ''
        best_score = None
        for position, match in enumerate(self._AUTHORS_LIST.finditer(filestring)):
            text = match.group(0)
            score = text.count('.') + text.count('italic') - position
            if best_score is None or score > best_score:
                best_text = text
                best_score = score
        return best_text

    def parse_authors(self, filestring):
        """Guess the author line contained in *filestring*.

        The first fragment that looks like names joined by a conjunction is
        used; when there is none, the best scoring tagged name list is used
        instead.  Tags and the standalone word "by" are removed.

        :param filestring: markup text to search.
        :returns: the author string, or an empty string when nothing matches.
        """
        match = self._AUTHORS_WITH_CONJUNCTION.search(filestring)
        if match is not None:
            raw = match.group(0)
        else:
            raw = self._best_author_list(filestring)
        # this is the author string. could try sending to parscit to get
        # individual author names.
        # entrepreneurship needs fixing, will be tough, has authors in
        # multiple XML elements
        return self._BYLINE_WORD.sub('', self._plain_text(raw)).strip()

    def parse_title(self, filestring):
        """Guess the title contained in *filestring*.

        A title that was split from its subtitle (first part ending in ':')
        is preferred; otherwise the first single title-like line is used.

        :param filestring: markup text to search.
        :returns: the title without tags, or an empty string when nothing
            matches.
        """
        # need to anticipate which other special characters are allowable in
        # titles
        match = (self._TITLE_WITH_SUBTITLE.search(filestring)
                 or self._TITLE_SINGLE.search(filestring))
        if match is None:
            return ''
        return self._plain_text(match.group(0))

    def get_file_text(self, filename):
        """Return the full contents of the file at *filename*."""
        with open(filename) as handle:
            return handle.read()

    def update_tmp_file(self):
        """Copy the NLM file over the temporary NLM file."""
        shutil.copy2(self.gv.NLM_FILE_PATH, self.gv.NLM_TEMP_FILE_PATH)

    def write_output(self, text):
        """Overwrite the NLM file with *text*."""
        with open(self.gv.NLM_FILE_PATH, 'w') as out:
            out.write(text)

    def run(self):
        """Read the temporary file, parse its title and write the text back.

        The parsed title is currently not used: the text is written to the NLM
        file unchanged and the temporary file is then refreshed from it.
        """
        text = self.get_file_text(self.gv.NLM_TEMP_FILE_PATH)
        #self.parse_authors(text)
        self.parse_title(text)
        self.write_output(text)
        self.update_tmp_file()