in archive/preprocess/growphrase.py [0:0]
def __init__(self, dbfile, colText, colCnt, min_support=.01):
    timer = Timer()
    self.min_support = min_support
    dbSize = 0
    ## first pass: load data, tokenize the text, and count token frequencies
    f = open(dbfile, 'rU')
    rdr = csv.reader(f, delimiter='\t')
    fdist = nltk.probability.FreqDist()
    for r in rdr:
        text = unicode(r[colText], 'utf-8')
        tokens = nltk.regexp_tokenize(text, tokenPattern)
        if colCnt < 0:
            num = 1    # no count column: each record counts once
        else:
            num = int(r[colCnt])
        for t in tokens:
            if t not in stopwords:
                fdist.inc(t, num)
        dbSize += num    # dbSize accumulates the total weight of all records
    self.dbSize = dbSize
    self.fdist = fdist
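    ## At this point fdist maps token -> weighted count. Presumably
    ## _checkMinSupport (not shown in this excerpt) tests a count against
    ## min_support * dbSize, i.e. the default of .01 keeps tokens that
    ## cover at least 1% of the weighted database.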
    ## turn text into itemset numberings
    ## NLTK 2's FreqDist iterates samples in decreasing order of frequency,
    ## so the loop can break at the first token below min support: every
    ## later token is at most as frequent (see the sketch after this method)
    itemset = []
    for w in self.fdist.keys():
        if not self._checkMinSupport(self.fdist[w]):
            break
        if w not in stopwords:    # redundant (stopwords never entered fdist)
            itemset.append(w)
    self.itemset = itemset
    ## second pass: re-encode each text as a list of itemset indices
    ## (an index is the token's frequency rank from the loop above)
    texts = []
    f.seek(0)    # rewind; in Python 2, seek() flushes the file iterator's
                 # read-ahead buffer, so rdr restarts from the first row
    for r in rdr:
        text = unicode(r[colText], 'utf-8')
        tokens = nltk.regexp_tokenize(text, tokenPattern)
        if colCnt < 0:
            num = 1
        else:
            num = int(r[colCnt])
        itemIDs = []
        for t in tokens:
            try:
                i = itemset.index(t)    # linear scan; a dict would make this O(1)
                itemIDs.append(i)
            except ValueError:
                pass    # token is below min support (or a stopword)
        if len(itemIDs) > 0:
            texts.append((itemIDs, num))
    self.texts = texts
    f.close()
    timer.printElapsed()
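
A minimal sketch of the support test and of how the class might be used.
This is an assumption, not the archived implementation: _checkMinSupport is
not shown in this excerpt, the class name GrowPhrase is inferred from the
filename, and Timer, tokenPattern, and stopwords are assumed to be
module-level definitions in growphrase.py. The fractional default of
min_support suggests a relative-support test against dbSize:

def _checkMinSupport(self, cnt):
    ## ASSUMPTION: support is measured relative to the total weighted
    ## record count; the real test in growphrase.py may differ
    return float(cnt) / self.dbSize >= self.min_support

## hypothetical usage: column 0 holds the text, column 1 a repeat count
# gp = GrowPhrase('data.tsv', colText=0, colCnt=1, min_support=.05)
# print gp.itemset[:10]    # the ten most frequent tokens above min support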