def closedMining()

in archive/preprocess/clospan.py [0:0]

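Mines closed frequent token sequences (phrases) from a tab-separated corpus in the style of CloSpan. A first pass over dbfile tokenizes each row and tallies token frequencies, weighted by the optional count column; a second pass re-encodes each row over the frequent vocabulary; the search is then seeded with all 1-item sequences, which self.cloSpan extends. min_support is presumably relative, a fraction of the total weighted row count dbSize. Results can optionally be written to cloSpanSeqs.txt and clospanresult.json. Note that this is legacy Python 2 code using the NLTK 2.x API (print statements, unicode(), FreqDist.inc()).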

	def closedMining(self, dbfile, colText, colCnt, min_support = .01, ifSaveSeq = False, ifSaveLattice = False):
		timer = Timer()
 
		self.min_support = min_support
		self.dbfile = dbfile

		dbSize = 0
		## first pass: load the data, tokenize each row, and tally token
		## frequencies over non-stopword tokens
		f = open(dbfile, 'rU')
		rdr = csv.reader(f, delimiter = '\t')
		fdist = nltk.probability.FreqDist()
		for r in rdr:
			text = unicode(r[colText], 'utf-8')
			tokens = nltk.regexp_tokenize(text, tokenPattern)
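			## colCnt < 0 means every row counts once; otherwise that
			## column holds the row's weight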
			if colCnt < 0:
				num = 1
			else:
				num = int(r[colCnt])
			for t in tokens:
				if t not in stopwords:
					fdist.inc(t, num)
			dbSize += num
		self.dbSize = dbSize
		self.fdist = fdist

		## build the itemset: tokens that meet min support; a token's index
		## in this list becomes its integer item id
		itemset = []
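		## NLTK 2.x FreqDist.keys() returns samples in decreasing-frequency
		## order, which is what lets this loop break at the first miss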
		for w in self.fdist.keys():
			if not self._checkMinSupport(self.fdist[w]):
				break
			if w not in stopwords:
				itemset.append(w)
		self.itemset = itemset

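		## second pass: rewind the file and re-encode each text as a
		## sequence of item ids, keeping the per-row count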
		texts = []
		f.seek(0)
		for r in rdr:
			text = unicode(r[colText], 'utf-8')
			tokens = nltk.regexp_tokenize(text, tokenPattern)
			if colCnt < 0:
				num = 1
			else:
				num = int(r[colCnt])
			text = []
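			## itemset.index raises ValueError for tokens pruned by the
			## min-support filter; those are simply skipped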
			for t in tokens:
				try:
					i = itemset.index(t)
					text.append(i)
				except ValueError:
					pass
			if len(text) > 0:
				texts.append((text, num))
		self.texts = texts

		## initialize the closed-sequence lattice (rooted at the empty
		## sequence) and the hash table of discovered closed sequences
		self.lattice = CloSeq([], 0)
		self.hash = HashTable()

		## start with 1-item sequences 
		for item in range(0, len(self.itemset)):
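			## D presumably holds per-text match offsets (a pseudo-projection
			## of the database); _findSupportDB fills it and returns the ids
			## of the texts supporting the item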
			D = [0] * len(self.texts)
			Ids = self._findSupportDB(D, item)
			self.cloSpan([item], D, Ids, self.lattice)

		f.close()

		## report results: number the closed phrases and print the
		## multi-token sequences with their supports
		sid = 0
		phrases = []
		print "Final closed sequences:"
		h = self.hash.getAll()
		ks = sorted(h.keys(), reverse = True)
		for k in ks:
			for v in h[k]:
				if v.parents:
					phrases.append({"entity":self.printSeq(v.s), "freq":k})
					v.id = sid
					sid += 1
				if len(v.s) > 1:
					print str(k) + " " + self.printSeq(v.s)

		if ifSaveSeq:
			f = open('cloSpanSeqs.txt', 'w')
			for k in ks:
				for v in h[k]:
					if len(v.s) > 1:
						## printSeq is print-ed above, so presumably carries no
						## trailing newline; add one so each sequence gets its own line
						f.write(str(k) + " " + self.printSeq(v.s) + "\n")
			f.close()
		
		if ifSaveLattice:
			links = []
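			## breadth-first walk over the lattice, emitting one link per
			## parent/child pair between closed sequences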
			tovisit = [self.lattice]
			visited = []
			while len(tovisit) > 0:
				seq = tovisit.pop(0)
				if seq.children:
					for c in seq.children:
						if seq.parents:
							links.append({"source":seq.id, "target":c.id, "freq":c.Ids})
						## also check tovisit: a node with several parents would
						## otherwise be enqueued, and its links emitted, twice
						if c not in visited and c not in tovisit:
							tovisit.append(c)
				visited.append(seq)
			result = {"entities":phrases, "links":links}
			f = open('clospanresult.json', 'w')
			## encode explicitly: with ensure_ascii=False, json.dumps returns a
			## unicode string when the data contains one, which a Python 2 byte
			## stream cannot write directly
			f.write(json.dumps(result, ensure_ascii=False).encode('utf-8'))
			f.close()

		timer.printElapsed()
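
A minimal call sketch, assuming an instance of the class in clospan.py that defines this method; the file name and column layout below are hypothetical:

	## Python 2 usage sketch; 'phrases.tsv', its column layout, and the way
	## the miner object is obtained are all hypothetical.
	miner = ...  ## an instance of the clospan.py class exposing closedMining
	miner.closedMining('phrases.tsv',
			colText = 0,           ## column holding the raw text
			colCnt = 1,            ## count column; pass -1 to weight each row as 1
			min_support = 0.01,    ## fraction of the total weighted row count
			ifSaveSeq = True,      ## also write cloSpanSeqs.txt
			ifSaveLattice = True)  ## also write clospanresult.json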