archive/preprocess/clospan.py (335 lines of code) (raw):

# CloSpan-style closed sequential pattern mining over tokenized tweet text.
# NOTE(review): Python 2 code (print statements, unicode(), 'rU' mode) using the
# pre-3.0 NLTK API (FreqDist.inc, frequency-sorted FreqDist.keys()) -- it will
# not run unmodified on Python 3 / NLTK 3+.
import csv, json, nltk, sys, time

# tweet token patterns (verbose regex: whitespace ignored, '#' starts a comment)
tokenPattern = r'''(?x)        # set flag to allow verbose regexps
    http://t\.co/\w+           # urls
    |http://t\.co\w+           # urls
    |http://t\.\w+             # urls
    |http://\w+                # urls
    | \@\w+                    # Twitter handles
    | \#\w+                    # hashtags
    | \d+(,\d+)+               # digits with internal commas
    | \w+(-\w+)*               # words with optional internal hyphens
    | \$?\d+(\.\d+)?%?         # currency and percentages, e.g. $12.40, 82%
    | ([A-Z]\.)+               # abbreviations, e.g. U.S.A
    '''

# NLTK English stopwords plus Twitter noise tokens and the domain words
# 'world'/'cup' (corpus-specific filter).
stopwords = nltk.corpus.stopwords.words('english') + ['rt', 'via', 'amp', 'http', 'https'] + ['world', 'cup']


class Timer(object):
    """Simple wall-clock timer; start() marks a point, printElapsed() reports."""

    def __init__(self):
        # s: start timestamp (set immediately); e/elapsed filled in by end()
        self.s = time.time()
        self.e = None
        self.elapsed = None

    def start(self):
        """Reset the start timestamp."""
        self.s = time.time()

    def end(self):
        """Record the end timestamp and compute elapsed seconds."""
        self.e = time.time()
        if self.s:
            self.elapsed = self.e - self.s

    def printElapsed(self):
        """Stop the timer and print the elapsed time in seconds."""
        self.end()
        if self.elapsed:
            print "Elapsed time = " + str(self.elapsed) + " sec."


class FDist(object):
    """Minimal frequency distribution backed by a plain dict (item -> count)."""

    def __init__(self):
        self.hashtable = {}

    def add(self, item, n = 1):
        """Increment item's count by n (n defaults to 1)."""
        h = self.hashtable
        if item in h:
            h[item] += n
        else:
            h[item] = n

    def freq(self, item):
        """Return item's count, or 0 if never added."""
        h = self.hashtable
        if item in h:
            return h[item]
        else:
            return 0

    def items(self):
        """Return the items sorted by descending count (counts not included)."""
        h = self.hashtable
        items = sorted(h.keys(), key = lambda i: h[i], reverse = True)
        return items


class HashTable(object):
    """Multimap: key -> list of values. Used to index CloSeq nodes by support."""

    def __init__(self):
        self.hashtable = {}

    def add(self, key, value):
        """Append value to key's list, creating the list on first use."""
        h = self.hashtable
        if key in h:
            h[key].append(value)
        else:
            h[key] = [value]

    def remove(self, key, value):
        """Remove one occurrence of value under key; drop the key when empty."""
        h = self.hashtable
        if key in h and value in h[key]:
            h[key].remove(value)
            if len(h[key]) == 0:
                del h[key]

    def replace(self, key, values):
        """Replace key's whole list; an empty list deletes the key instead."""
        if len(values) > 0:
            self.hashtable[key] = values
        elif key in self.hashtable:
            del self.hashtable[key]

    def pop(self, key):
        """Remove and return key's list, or None if the key is absent."""
        h = self.hashtable
        r = None
        if key in h:
            r = h[key]
            del h[key]
        return r

    def get(self, key):
        """Return key's list if present and non-empty, else None."""
        h = self.hashtable
        if key in h and len(h[key]) > 0:
            return h[key]
        else:
            return None

    def getAll(self):
        """Return the underlying dict (shared reference, not a copy)."""
        return self.hashtable

    def displayAll(self):
        """Debug print: keys descending, each with its (Ids, s) CloSeq pairs."""
        ks = sorted(self.hashtable.keys(), reverse = True)
        for k in ks:
            print str(k) + " > " + str( [ (v.Ids, v.s) for v in self.hashtable[k]] )


class CloSeq(object):
    """Node in the closed-sequence lattice.

    s        -- the sequence (list of itemset indices)
    Ids      -- weighted support count of the sequence
    parents  -- set of CloSeq nodes this sequence extends (None for the root)
    children -- set of extension nodes (None until the first child links in)
    """

    def __init__(self, s, Ids, p = None):
        self.s = s
        self.Ids = Ids
        self.children = None
        if p:
            # Link both directions; parent's children set is created lazily.
            self.parents = set([p])
            if p.children:
                p.children.add(self)
            else:
                p.children = set([self])
        else:
            self.parents = None

    def addParent(self, p):
        """Attach an additional parent p, linking both directions."""
        if self.parents:
            self.parents.add(p)
        else:
            self.parents = set([p])
        if p.children:
            p.children.add(self)
        else:
            p.children = set([self])

    def consumeSeq(self, old):
        """Absorb node old (a subsequence with equal support): take over its
        parents and re-point those parents' child links from old to self.

        Raises RuntimeError when old is the root / has no parents.
        NOTE(review): old's children are NOT re-parented (the code for that is
        commented out below) -- presumably never needed in practice; confirm.
        """
        if not old.parents:
            raise RuntimeError("Trying to merge the root or a sequence without parents.")
        if self.parents:
            self.parents = self.parents.union(old.parents)
            if old in self.parents:
                self.parents.remove(old)
        else:
            self.parents = old.parents
        for p in old.parents:
            p.children.remove(old)
            p.children.add(self)
        # if old.children:
        #     print str(self.Ids) + ":" + str(self.s) + " trying to consume " + str(old.Ids)+":"+str(old.s) + " and its children " + str([(c.Ids,c.s) for c in old.children])
        #     if self.children:
        #         self.children.union(old.children)
        #     else:
        #         self.children = old.children
        #     for c in old.children:
        #         c.parents.remove(old)
        #         c.parents.add(self)


class CloSpan(object):
    """Closed sequential pattern miner over a tab-separated text database."""

    def closedMining(self, dbfile, colText, colCnt, min_support = .01, ifSaveSeq = False, ifSaveLattice = False):
        """Mine closed frequent phrases from dbfile and print/save the results.

        dbfile       -- tab-separated file; colText is the text column index
        colCnt       -- column holding a per-row weight, or < 0 to weight each row as 1
        min_support  -- minimum support as a fraction of total weighted rows
        ifSaveSeq    -- also write sequences to 'cloSpanSeqs.txt'
        ifSaveLattice -- also write phrases+lattice links to 'clospanresult.json'
        """
        timer = Timer()
        self.min_support = min_support
        self.dbfile = dbfile
        dbSize = 0
        ## load data and tokenize the text
        f = open(dbfile, 'rU')
        rdr = csv.reader(f, delimiter = '\t')
        fdist = nltk.probability.FreqDist()
        for r in rdr:
            text = unicode(r[colText], 'utf-8')
            tokens = nltk.regexp_tokenize(text, tokenPattern)
            if colCnt < 0:
                num = 1
            else:
                num = int(r[colCnt])
            for t in tokens:
                if t not in stopwords:
                    fdist.inc(t, num)    # pre-NLTK-3 API; FreqDist[t] += num today
            dbSize += num                # dbSize = total weighted row count
        self.dbSize = dbSize
        self.fdist = fdist
        ## turn text into itemset numberings
        # Relies on old-NLTK FreqDist.keys() being sorted by descending
        # frequency, so the loop can break at the first infrequent word.
        itemset = []
        for w in self.fdist.keys():
            if not self._checkMinSupport(self.fdist[w]):
                break
            if w not in stopwords:
                itemset.append(w)
        self.itemset = itemset
        texts = []
        # Second pass over the same open file/reader (Python 2 csv.reader is
        # lazy, so seek(0) restarts iteration).
        f.seek(0)
        for r in rdr:
            text = unicode(r[colText], 'utf-8')
            tokens = nltk.regexp_tokenize(text, tokenPattern)
            if colCnt < 0:
                num = 1
            else:
                num = int(r[colCnt])
            # Re-encode each row as a list of itemset indices, dropping
            # infrequent tokens (itemset.index raises ValueError for them).
            text = []
            for t in tokens:
                try:
                    i = itemset.index(t)
                    text.append(i)
                except ValueError:
                    pass
            if len(text) > 0:
                texts.append((text, num))
        self.texts = texts
        ## Initialize the close sequence lattice and hashtable
        self.lattice = CloSeq([], 0)   # root node: empty sequence
        self.hash = HashTable()        # support count -> list of CloSeq nodes
        ## start with 1-item sequences
        for item in range(0, len(self.itemset)):
            D = [0 for i in range(0, len(self.texts))]
            Ids = self._findSupportDB(D, item)
            self.cloSpan([item], D, Ids, self.lattice)
        f.close()
        ## display results
        sid = 0
        phrases = []
        print "Final closed sequences:"
        h = self.hash.getAll()
        ks = sorted(h.keys(), reverse = True)
        for k in ks:
            for v in h[k]:
                # Only nodes attached to the lattice (non-root) get an id and
                # are reported.
                if v.parents:
                    phrases.append({"entity":self.printSeq(v.s), "freq":k})
                    v.id = sid
                    sid += 1
                    if len(v.s) > 1:
                        print str(k) + " " + self.printSeq(v.s)
        if ifSaveSeq:
            f = open('cloSpanSeqs.txt', 'w')
            for k in ks:
                for v in h[k]:
                    if len(v.s) > 1:
                        f.write( str(k) + " " + self.printSeq(v.s) )
            f.close()
        if ifSaveLattice:
            # BFS over the lattice collecting parent->child edges; edges out of
            # the root (no parents) are skipped since the root has no id.
            links = []
            tovisit = [self.lattice]
            visited = []
            while len(tovisit) > 0:
                seq = tovisit.pop(0)
                #print (seq.Ids, self.printSeq(seq.s))
                if seq.children:
                    # print ' children: ' + str([ (c.Ids, self.printSeq(c.s)) for c in seq.children])
                    for c in seq.children:
                        if seq.parents:
                            # print str(seq.id) + ": " + self.printSeq(seq.id)
                            # print str(c.id) + ": " + self.printSeq(c.s)
                            links.append({"source":seq.id, "target":c.id, "freq":c.Ids})
                        if not c in visited:
                            tovisit.append(c)
                visited.append(seq)
            result = {"entities":phrases, "links":links}
            f = open('clospanresult.json', 'w')
            f.write((json.dumps(result, ensure_ascii=False)))
            f.close()
        timer.printElapsed()

    # def storeSeq2tweet(self):
    #     h = self.hash.getAll()
    #     ks = sorted(h.keys(), reverse = True)
    #     for k in ks:
    #         for v in h[k]:
    #             if len(v.s) > 1:
    #                 print str(k) + " > " + self.printSeq(v.s)

    def _printLattice(self):
        """Debug dump of the lattice via BFS from the root."""
        print "Lattice:"
        tovisit = [self.lattice]
        visited = []
        while len(tovisit) > 0:
            seq = tovisit.pop(0)
            print (seq.Ids, self.printSeq(seq.s))
            if seq.parents:
                print ' parents:' + str([ (c.Ids, self.printSeq(c.s)) for c in seq.parents])
            if seq.children:
                print ' children: ' + str([ (c.Ids, self.printSeq(c.s)) for c in seq.children])
                for c in seq.children:
                    if not c in visited:
                        tovisit.append(c)
            visited.append(seq)

    def cloSpan(self, s, Ds, Ids, parent):
        """Recursive CloSpan step: register sequence s, then grow it.

        s      -- current sequence (list of itemset indices)
        Ds     -- per-text match position: index just past s's match, or -1 if
                  the text does not support s
        Ids    -- weighted support of s
        parent -- lattice node s was grown from

        Closedness is maintained via the support-keyed hash: sequences with the
        same support are compared for containment; a subsumed new sequence is
        dropped (only a parent link is added), and subsumed old ones are merged
        into the new node via consumeSeq.
        """
        texts = self.texts
        #self._printLattice()
        #print "\ncloSpan: " + str(Ids) + ":" + str(self.printSeq(s))
        ## check if s is a sup/sub-sequence of discovered sequences
        Ldsc = self.hash.pop(Ids)
        Lno = []   # unrelated sequences with the same support
        Lsup = []  # discovered sequences contained in s
        Lsub = []  # discovered sequences containing (or equal to) s
        if Ldsc:
            for dn in Ldsc:
                c = self._checkSeqContainment(dn.s, s)
                if c == 0 or c == 1:
                    # s already discovered or discovered seq contains s
                    Lsub.append(dn)
                elif c == -1:
                    # discovered seq is contained in s
                    Lsup.append(dn)
                else:
                    # discovered seq and s do not contain each other
                    Lno.append(dn)
        if len(Lsup) > 0 and len(Lsub) > 0:
            raise RuntimeError("Conflicting sequences found in Lattice. Current sequence: " + self.printSeq(s) + ", Lsup: " + str([self.printSeq(o.s) for o in Lsup]) + ", Lsub: " + str([self.printSeq(o.s) for o in Lsub]))
        # add parent to dn's parents, do this for all dn
        if len(Lsub) > 0:
            # s is not closed (an equal-support supersequence exists): just link
            # parent to each containing node and stop -- no recursion.
            for dn in Lsub:
                dn.addParent(parent)
            self.hash.replace(Ids, Ldsc)
            return
        seq = CloSeq(s, Ids, parent)
        # seq takes all of dn's parents
        if len(Lsup) > 0:
            for dn in Lsup:
                seq.consumeSeq(dn)
        Lno.append(seq)
        self.hash.replace(Ids, Lno)
        # add s to tree: parent is previous tree node - need to pass in prev tree node
        #print str(Ids) + ' + ' + self.printSeq(s)
        ## grow s: scan DB for next freq items and their supports
        # Only the token immediately at each text's current position Ds[i] is a
        # candidate extension, i.e. s is grown contiguously.
        fdist = FDist()
        for i in range(0, len(texts)):
            if Ds[i] >= 0 and Ds[i] < len(texts[i][0]):
                fdist.add(texts[i][0][Ds[i]], texts[i][1])
        for a in fdist.items():
            # items() is sorted by descending count, so break at the first
            # extension below minimum support.
            if not self._checkMinSupport(fdist.freq(a)):
                break
            ## update suppporting DB
            Dsa = []
            for i in range(0, len(texts)):
                if Ds[i] < 0 or Ds[i] >= len(texts[i][0]) or texts[i][0][Ds[i]] != a:
                    Dsa.append(-1)
                else:
                    Dsa.append(Ds[i]+1)
            sa = list(s)
            sa.append(a)
            self.cloSpan(sa, Dsa, fdist.freq(a), seq)

    def _checkMinSupport(self, cnt):
        """True iff cnt meets the minimum support threshold (fraction of dbSize).

        Raises NameError if closedMining has not yet set dbSize/min_support.
        """
        if not hasattr(self, 'dbSize'):
            raise NameError("dbSize is not defined.")
        if not hasattr(self, 'min_support'):
            raise NameError("min_support is not defined.")
        if cnt >= self.dbSize * self.min_support:
            return True
        else:
            return False

    ## returns -1: s1 < s2, 0: s1 = s2, 1: s1 > s2, any other value: s1 <> s2
    def _checkSeqContainment(self, s1, s2):
        """Subsequence containment test between item-index lists s1 and s2.

        Returns -1 if s1 is a (not necessarily contiguous) subsequence of s2,
        1 if s2 is a subsequence of s1, 0 if equal, 2 otherwise. The greedy
        two-pointer walk bails out early once the remaining shorter side can no
        longer fit in the remaining longer side.
        """
        if len(s1) == len(s2):
            for i in range(0, len(s1)):
                if s1[i] != s2[i]:
                    return 2
            return 0
        elif len(s1) < len(s2):
            i1 = 0
            i2 = 0
            while i1 < len(s1) and i2 < len(s2) and len(s1) - i1 <= len(s2) - i2:
                if s1[i1] == s2[i2]:
                    i1 += 1
                    i2 += 1
                else:
                    i2 += 1
            if i1 >= len(s1):
                return -1
            else:
                return 2
        else:  # len(s1) > len(s2)
            i1 = 0
            i2 = 0
            while i1 < len(s1) and i2 < len(s2) and len(s1) - i1 >= len(s2) - i2:
                if s1[i1] == s2[i2]:
                    i1 += 1
                    i2 += 1
                else:
                    i1 += 1
            if i2 >= len(s2):
                return 1
            else:
                return 2

    ## Ds is the previous supporting DB, a is the new item in the sequence
    def _findSupportDB(self, Ds, a):
        """Advance each supporting text past its first occurrence of item a.

        Mutates Ds in place: Ds[i] becomes the index just after a's first
        occurrence at/after the old position, or -1 when a does not occur.
        Returns the weighted support (sum of text weights still matching).
        """
        Ids = 0
        for i in range(0, len(self.texts)):
            if Ds[i] != -1:
                try:
                    pos = self.texts[i][0][Ds[i]:].index(a) + Ds[i] + 1
                    Ds[i] = pos
                    Ids += self.texts[i][1]
                except ValueError:
                    Ds[i] = -1
        return Ids

    def printSeq(self, s):
        """Render an item-index sequence as space-separated words (trailing space kept)."""
        sp = ''
        for i in s:
            sp += self.itemset[i] + ' '
        return sp


def main(argv):
    """CLI entry: argv = [dbfile, colText, colCnt, (optional) min_support]."""
    c = CloSpan()
    #print float(argv[3]) if len(argv)> 3 else 0.01
    c.closedMining(argv[0], int(argv[1]), int(argv[2]), float(argv[3]) if len(argv)> 3 else 0.01)


if __name__ == "__main__":
    main(sys.argv[1:])