def __init__()

in archive/preprocess/growphrase2.py [0:0]


	def __init__(self, dbfile, colText, colCnt, min_support = .01):
		"""Load a tab-separated corpus, tokenize each row, and build the vocabulary.

		dbfile      -- path to a tab-delimited UTF-8 text file, one record per row
		colText     -- index of the column holding the text to tokenize
		colCnt      -- index of the column holding the record count; a negative
		               value means every row counts as a single occurrence
		min_support -- minimum support threshold, stored for later use

		Sets self.dbSize (total occurrence count), self.vocab (token -> id),
		self.itemset (id -> token), and self.texts (list of (id-list, count)).
		"""
		timer = Timer()

		self.min_support = min_support

		dbSize = 0          # total number of occurrences over all rows
		vocab = {}          # token -> integer id
		itemset = []        # integer id -> token (inverse mapping of vocab)
		texts = []          # (list of token ids, occurrence count) per row
		## load data, tokenize the text, hash vocabulary
		# 'with' guarantees the handle is closed even if a row fails to parse;
		# the original only closed the file on the success path.
		with open(dbfile, 'rU') as f:
			rdr = csv.reader(f, delimiter = '\t')
			for r in rdr:
				text = unicode(r[colText], 'utf-8')
				tokens = nltk.regexp_tokenize(text, tokenPattern)
				if colCnt < 0:
					num = 1
				else:
					num = int(r[colCnt])
				# ids: token-id sequence for this row (was confusingly named
				# 'text', shadowing the unicode string above)
				ids = []
				for t in tokens:
					if t not in stopwords:
						if t not in vocab:
							# first sighting: assign the next free integer id
							vocab[t] = len(itemset)
							itemset.append(t)
						ids.append(vocab[t])
				if len(ids) > 0:
					texts.append((ids, num))
				dbSize += num
		self.dbSize = dbSize
		self.vocab = vocab
		self.itemset = itemset
		self.texts = texts
		timer.printElapsed()