in archive/preprocess/clospan.py [0:0]
def closedMining(self, dbfile, colText, colCnt, min_support = .01, ifSaveSeq = False, ifSaveLattice = False):
    """Mine closed sequences from a tab-separated text database.

    The database is read twice: the first pass tokenizes every row and
    builds a weighted token frequency distribution, the second pass turns
    each row into a list of item indices.  Every frequent item is then
    handed to ``self.cloSpan`` as a 1-item seed sequence, and the sequences
    collected in ``self.hash`` are printed and optionally exported.

    Parameters:
        dbfile: path of the tab-delimited input file.
        colText: index of the column holding the UTF-8 encoded text.
        colCnt: index of the column holding the integer weight of a row;
            a negative value gives every row a weight of 1.
        min_support: stored as ``self.min_support``; presumably the
            threshold applied by ``self._checkMinSupport`` (defined
            elsewhere -- confirm there).
        ifSaveSeq: when true, write every sequence longer than one item to
            'cloSpanSeqs.txt', one per line.
        ifSaveLattice: when true, write the phrases and the lattice links
            as JSON to 'clospanresult.json'.

    Side effects:
        Sets ``self.min_support``, ``self.dbfile``, ``self.dbSize``,
        ``self.fdist``, ``self.itemset``, ``self.texts``, ``self.lattice``
        and ``self.hash``; assigns ``id`` on every stored sequence that has
        parents; prints the result listing and the elapsed time.

    Raises:
        IOError if a file cannot be opened, ValueError if a weight cell is
        not an integer, IndexError if a row lacks a requested column.
    """
    timer = Timer()
    self.min_support = min_support
    self.dbfile = dbfile

    def read_rows(handle):
        """Yield (tokens, weight) for each row of the open database file."""
        # A fresh reader per pass, so the second pass does not depend on
        # re-iterating an already exhausted csv reader.
        for row in csv.reader(handle, delimiter = '\t'):
            tokens = nltk.regexp_tokenize(unicode(row[colText], 'utf-8'), tokenPattern)
            weight = 1 if colCnt < 0 else int(row[colCnt])
            yield tokens, weight

    def to_bytes(value):
        """Encode unicode text as UTF-8; leave byte strings untouched."""
        # Python 2 file.write() implicitly ASCII-encodes unicode objects and
        # raises UnicodeEncodeError on the first non-ASCII character.
        if isinstance(value, unicode):
            return value.encode('utf-8')
        return value

    # The input file is only needed while loading, so it is closed (even on
    # error) before the mining phase starts.
    with open(dbfile, 'rU') as f:
        ## load data and tokenize the text
        dbSize = 0
        fdist = nltk.probability.FreqDist()
        for tokens, num in read_rows(f):
            for t in tokens:
                if t not in stopwords:
                    fdist.inc(t, num)
            dbSize += num
        self.dbSize = dbSize
        self.fdist = fdist

        ## turn text into itemset numberings
        itemset = []
        # The early exit assumes keys() is ordered by decreasing frequency
        # (as NLTK 2.x FreqDist does): the first word below the support
        # threshold ends the scan.
        for w in self.fdist.keys():
            if not self._checkMinSupport(self.fdist[w]):
                break
            if w not in stopwords:
                itemset.append(w)
        self.itemset = itemset
        # word -> item number; replaces a linear list.index() per token.
        item_index = dict((w, i) for i, w in enumerate(itemset))

        texts = []
        f.seek(0)
        for tokens, num in read_rows(f):
            text = [item_index[t] for t in tokens if t in item_index]
            # Rows without any frequent item cannot support a sequence.
            if text:
                texts.append((text, num))
    self.texts = texts

    ## Initialize the close sequence lattice and hashtable
    self.lattice = CloSeq([], 0)
    self.hash = HashTable()

    ## start with 1-item sequences
    for item in range(len(self.itemset)):
        # A fresh list for every seed item; it is passed to both
        # _findSupportDB and cloSpan.
        D = [0] * len(self.texts)
        Ids = self._findSupportDB(D, item)
        self.cloSpan([item], D, Ids, self.lattice)

    ## display results
    sid = 0
    phrases = []
    print("Final closed sequences:")
    h = self.hash.getAll()
    ks = sorted(h.keys(), reverse = True)
    for k in ks:
        for v in h[k]:
            if v.parents:
                # v.id is the position of the phrase in `phrases`; the
                # lattice links below refer to phrases through these ids.
                phrases.append({"entity": self.printSeq(v.s), "freq": k})
                v.id = sid
                sid += 1
            # NOTE(review): the listing is assumed to cover every sequence
            # longer than one item, matching the file export below.
            if len(v.s) > 1:
                print(str(k) + " " + self.printSeq(v.s))

    if ifSaveSeq:
        with open('cloSpanSeqs.txt', 'w') as out:
            for k in ks:
                for v in h[k]:
                    if len(v.s) > 1:
                        # One record per line; without the separator all
                        # sequences run together in the file.
                        out.write(to_bytes(str(k) + " " + self.printSeq(v.s)) + "\n")

    if ifSaveLattice:
        links = []
        # Breadth-first walk.  `queue` holds every node seen so far and
        # `pos` marks the next one to expand, so a node reachable from
        # several parents is enqueued -- and its links emitted -- only once.
        queue = [self.lattice]
        pos = 0
        while pos < len(queue):
            seq = queue[pos]
            pos += 1
            if seq.children:
                for c in seq.children:
                    # A node without parents (the lattice root) was never
                    # given an id above, so it cannot be a link source.
                    if seq.parents:
                        links.append({"source": seq.id, "target": c.id, "freq": c.Ids})
                    if c not in queue:
                        queue.append(c)
        result = {"entities": phrases, "links": links}
        with open('clospanresult.json', 'w') as out:
            # With ensure_ascii=False the dump may be a unicode object.
            out.write(to_bytes(json.dumps(result, ensure_ascii=False)))

    timer.printElapsed()