in archive/preprocess/growphrase3.py [0:0]
def __init__(self, dbfile, colText, colCnt, min_support = .01):
    """Load a tab-separated corpus and build an integer-coded vocabulary.

    NOTE: Python 2 code (`unicode`, 'rU' file mode).

    dbfile      -- path to a TSV file ('\t'-delimited, '"' quote char,
                   '\\' escape char)
    colText     -- index of the column holding the raw UTF-8 text
    colCnt      -- index of the column holding the record count; pass a
                   negative value to weight every row as 1
    min_support -- minimum support threshold, stored on self for later use

    Populates:
      self.dbSize  -- total weighted count of retained (non-empty) rows
      self.vocab   -- token -> integer id
      self.itemset -- integer id -> token (inverse of vocab)
      self.texts   -- list of (token-id list, count) pairs
    """
    timer = Timer()
    self.min_support = min_support
    dbSize = 0
    vocab = {}      # token -> integer id
    itemset = []    # integer id -> token
    texts = []      # (encoded token list, row weight) per non-empty row

    # Load data, tokenize the text, hash the vocabulary.
    # 'with' guarantees the handle is closed even when a row fails to
    # parse; the original leaked it on any exception before f.close().
    # (The original also built an nltk FreqDist that was never used;
    # that dead code is removed here.)
    with open(dbfile, 'rU') as f:
        rdr = csv.reader(f, delimiter = '\t', quotechar='"', escapechar='\\')
        for r in rdr:
            text = unicode(r[colText], 'utf-8').lower()
            tokens = nltk.regexp_tokenize(text, tokenPattern)
            # Negative colCnt means "no count column": weight the row as 1.
            if colCnt < 0:
                num = 1
            else:
                num = int(r[colCnt])
            # Re-encode the row as a list of vocabulary ids, assigning a
            # fresh id to each previously unseen non-stopword token.
            text = []
            for t in tokens:
                if t not in stopwords:
                    if t not in vocab:
                        vocab[t] = len(itemset)
                        itemset.append(t)
                    text.append(vocab[t])
            # Rows that tokenize to nothing (all stopwords / empty) are
            # dropped and do not contribute to dbSize.
            # NOTE(review): source indentation was ambiguous here; this
            # assumes dbSize counts only retained rows — confirm.
            if len(text) > 0:
                texts.append((text, num))
                dbSize += num
    self.dbSize = dbSize
    self.vocab = vocab
    self.itemset = itemset
    self.texts = texts
    timer.printElapsed()