# __init__()
#
# from archive/preprocess/growphrase.py [0:0]


	def __init__(self, dbfile, colText, colCnt, min_support = .01):
		"""Load a tab-separated corpus, count token frequencies, and encode
		each record as a list of itemset indices.

		dbfile      -- path to a tab-separated file, one record per row
		colText     -- column index of the UTF-8 text field
		colCnt      -- column index of the per-record count; pass a
		               negative value to weight every record as 1
		min_support -- support threshold used by _checkMinSupport to decide
		               which tokens enter the itemset

		Sets self.dbSize, self.fdist, self.itemset, self.texts.
		"""
		timer = Timer()

		self.min_support = min_support

		dbSize = 0
		f = open(dbfile, 'rU')
		try:
			## First pass: tokenize every record and accumulate a weighted
			## frequency distribution over non-stopword tokens.
			rdr = csv.reader(f, delimiter = '\t')
			fdist = nltk.probability.FreqDist()
			for r in rdr:
				text = unicode(r[colText], 'utf-8')
				tokens = nltk.regexp_tokenize(text, tokenPattern)
				if colCnt < 0:
					num = 1
				else:
					num = int(r[colCnt])
				for t in tokens:
					if t not in stopwords:
						fdist.inc(t, num)
				dbSize += num
			self.dbSize = dbSize
			self.fdist = fdist

			## Keep only tokens that meet min_support.  NOTE(review):
			## the early `break` assumes FreqDist.keys() is sorted by
			## decreasing frequency (true in the old NLTK API this code
			## targets), so the first failing token ends the scan.
			itemset = []
			for w in self.fdist.keys():
				if not self._checkMinSupport(self.fdist[w]):
					break
				if w not in stopwords:
					itemset.append(w)
			self.itemset = itemset
			## O(1) token -> index lookup; replaces the per-token
			## itemset.index() call, which was O(len(itemset)) each time.
			itemIndex = dict((w, i) for i, w in enumerate(itemset))

			## Second pass: rewind and re-read the file, encoding each
			## record as the indices of its supported tokens.  Records
			## with no supported tokens are dropped.
			texts = []
			f.seek(0)
			for r in rdr:
				text = unicode(r[colText], 'utf-8')
				tokens = nltk.regexp_tokenize(text, tokenPattern)
				if colCnt < 0:
					num = 1
				else:
					num = int(r[colCnt])
				encoded = [itemIndex[t] for t in tokens if t in itemIndex]
				if len(encoded) > 0:
					texts.append((encoded, num))
			self.texts = texts
		finally:
			## Always release the file handle, even if parsing raises.
			f.close()
		timer.printElapsed()