https://github.com/recski/HunTag
Tip revision: ac681ac97f8677f25c29060fc8a168d68aca22d9 authored by Gábor Recski on 18 January 2016, 08:52:12 UTC
Update README.md
Update README.md
Tip revision: ac681ac
lexicon.py
# lexicon.py is a module of HunTag; it is called by the module feature.py.
# The Lexicon class generates so-called lexicon features.
# One Lexicon instance should be created for each lexicon file.
import sys
class Lexicon():
    """Phrase lexicon loaded from one file, used to produce lexicon features.

    Every line of the file is treated as one phrase. The stripped line is
    stored as a whole, and for phrases of two or more whitespace-separated
    words the individual words are also recorded by position.

    Attributes:
        list: set of complete phrases (each line with surrounding
            whitespace stripped). A blank line contributes the empty string.
        startParts: set of first words of multi-word phrases.
        midParts: set of interior words of phrases with three or more words.
        endParts: set of last words of multi-word phrases.
    """

    def __init__(self, file):
        """Build the lexicon from the text file at path ``file``.

        Args:
            file: path of the lexicon file, one phrase per line.
        """
        self.list = set()
        self.endParts = set()
        self.midParts = set()
        self.startParts = set()
        # The context manager guarantees the handle is closed, even if
        # reading fails part-way through.
        with open(file) as lexiconFile:
            for line in lexiconFile:
                self.list.add(line.strip())
                words = line.split()
                if len(words) > 1:
                    self.startParts.add(words[0])
                    self.endParts.add(words[-1])
                    # The slice is empty for two-word phrases, so no
                    # separate length check is needed.
                    self.midParts.update(words[1:-1])

    def getWordFeats(self, word):
        """Return the lexicon features that apply to a single word.

        Args:
            word: the token to look up.

        Returns:
            A list containing, in this order, those of 'lone', 'end',
            'start' and 'mid' for which ``word`` is respectively a complete
            phrase, a phrase-final word, a phrase-initial word or a
            phrase-internal word. Empty if none apply.
        """
        # Order matters: it fixes the order of features in the result.
        lookups = (
            ('lone', self.list),
            ('end', self.endParts),
            ('start', self.startParts),
            ('mid', self.midParts),
        )
        return [feat for feat, entries in lookups if word in entries]

    def lexEvalSentence(self, sentence):
        """Return one feature list per word of ``sentence``.

        Args:
            sentence: iterable of words.

        Returns:
            A list whose i-th element is ``getWordFeats`` of the i-th word.
        """
        return [self.getWordFeats(word) for word in sentence]