archive/preprocess/growphrase.py:
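# Greedy frequent-phrase extraction for tweet text: reads a tab-separated corpus,
# tokenizes each tweet, keeps tokens that clear a minimum support threshold, grows
# frequent word sequences over the corpus, and writes the resulting word nodes and
# links to growPhraseResults.json.
# Note: the script targets Python 2 and the NLTK 2.x API (FreqDist.inc and
# frequency-ordered FreqDist.keys()).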
import bisect, csv, json, nltk, sys, time
#tweet token patterns
tokenPattern = r'''(?x) # set flag to allow verbose regexps
http://t\.co/\w+ # urls
|http://t\.co\w+ # urls
|http://t\.\w+ # urls
|http://\w+ # urls
| \@\w+ # Twitter handles
| \#\w+ # hashtags
| \d+(?:,\d+)+ # digits with internal commas
| \w+(?:-\w+)* # words with optional internal hyphens
| \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
| (?:[A-Z]\.)+ # abbreviations, e.g. U.S.A.
'''
stopwords = nltk.corpus.stopwords.words('english') + ['rt', 'via', 'amp', 'http', 'https']
class Timer(object):
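    # Simple wall-clock timer: start() records a start time, end() records the end
    # time, and printElapsed() stops the timer and prints the elapsed seconds.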
    def __init__(self):
        self.s = time.time()
        self.e = None
        self.elapsed = None
    def start(self):
        self.s = time.time()
    def end(self):
        self.e = time.time()
        if self.s:
            self.elapsed = self.e - self.s
    def printElapsed(self):
        self.end()
        if self.elapsed:
            print "Elapsed time = " + str(self.elapsed) + " sec."
class FDist(object):
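    # Minimal frequency distribution backed by a dict: add() accumulates counts,
    # freq() looks up a count (0 if absent), items() returns keys by descending count.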
    def __init__(self):
        self.hashtable = {}
    def add(self, item, n = 1):
        h = self.hashtable
        if item in h:
            h[item] += n
        else:
            h[item] = n
    def freq(self, item):
        h = self.hashtable
        if item in h:
            return h[item]
        else:
            return 0
    def items(self):
        h = self.hashtable
        items = sorted(h.keys(), key = lambda i: h[i], reverse = True)
        return items
class HashTable(object):
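    # Dict of lists keyed by arbitrary values: add()/remove() manage single entries,
    # replace() swaps a key's whole list, pop()/get() retrieve entries, and
    # displayAll() dumps keys in descending order (it assumes the stored values
    # expose .Ids and .s attributes).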
    def __init__(self):
        self.hashtable = {}
    def add(self, key, value):
        h = self.hashtable
        if key in h:
            h[key].append(value)
        else:
            h[key] = [value]
    def remove(self, key, value):
        h = self.hashtable
        if key in h and value in h[key]:
            h[key].remove(value)
            if len(h[key]) == 0:
                del h[key]
    def replace(self, key, values):
        if len(values) > 0:
            self.hashtable[key] = values
        elif key in self.hashtable:
            del self.hashtable[key]
    def pop(self, key):
        h = self.hashtable
        r = None
        if key in h:
            r = h[key]
            del h[key]
        return r
    def get(self, key):
        h = self.hashtable
        if key in h and len(h[key]) > 0:
            return h[key]
        else:
            return None
    def getAll(self):
        return self.hashtable
    def displayAll(self):
        ks = sorted(self.hashtable.keys(), reverse = True)
        for k in ks:
            print str(k) + " > " + str( [ (v.Ids, v.s) for v in self.hashtable[k]] )
class Corpus(object):
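    # Loads a tweet corpus, builds the frequent-token itemset, and exposes growSets()
    # to extract frequent phrases as a node/link graph.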
    def __init__(self, dbfile, colText, colCnt, min_support = .01):
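        # dbfile:      tab-separated input file
        # colText:     index of the column holding the tweet text
        # colCnt:      index of a per-row count column (negative means count each row once)
        # min_support: minimum fraction of the weighted corpus a token/sequence must cover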
        timer = Timer()
        self.min_support = min_support
        dbSize = 0
        ## load data and tokenize the text
        f = open(dbfile, 'rU')
        rdr = csv.reader(f, delimiter = '\t')
        fdist = nltk.probability.FreqDist()
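        # first pass: accumulate token frequencies, weighting each tweet by its count column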
        for r in rdr:
            text = unicode(r[colText], 'utf-8')
            tokens = nltk.regexp_tokenize(text, tokenPattern)
            if colCnt < 0:
                num = 1
            else:
                num = int(r[colCnt])
            for t in tokens:
                if t not in stopwords:
                    fdist.inc(t, num)
            dbSize += num
        self.dbSize = dbSize
        self.fdist = fdist
        ## turn text into itemset numberings
        itemset = []
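        # FreqDist.keys() is ordered by decreasing frequency in NLTK 2.x, so the loop
        # below can stop at the first word that falls under the minimum support threshold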
        for w in self.fdist.keys():
            if not self._checkMinSupport(self.fdist[w]):
                break
            if w not in stopwords:
                itemset.append(w)
        self.itemset = itemset
        texts = []
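        # second pass: rewind the file and re-encode each tweet as a list of itemset
        # indices, dropping tokens that did not make it into the itemset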
        f.seek(0)
        for r in rdr:
            text = unicode(r[colText], 'utf-8')
            tokens = nltk.regexp_tokenize(text, tokenPattern)
            if colCnt < 0:
                num = 1
            else:
                num = int(r[colCnt])
            text = []
            for t in tokens:
                try:
                    i = itemset.index(t)
                    text.append(i)
                except ValueError:
                    pass
            if len(text) > 0:
                texts.append((text, num))
        self.texts = texts
        f.close()
        timer.printElapsed()
    def growSets(self):
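        # Greedily grows frequent word sequences: repeatedly takes the group with the
        # largest weighted support, finds the most frequent word that can be inserted
        # at some gap of its sequence, and splits the group on that word.  Emits one
        # node per inserted word and weighted links between adjacent nodes, then
        # writes the result to growPhraseResults.json.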
        timer = Timer()
        groups = []
        nodes = []
        links = {} #adjacency list
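        # nodes: one dict per emitted word ({'entity', 'freq', 'id'});
        # links: nested dict mapping source node id -> target node id -> weighted frequency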
        texts = self.texts
        #init sets
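        # a group is a projected database: 'seq' is the node sequence grown so far,
        # 'size' its weighted support, and 'DBs' the texts that still match it, each
        # recording in 'seqIndices' the positions of the sequence items within the text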
        g0 = {'seq':[], 'newItemPos':None, 'size':self.dbSize, 'DBs': [ {'text':t[0], 'count':t[1], 'seqIndices':[]} for t in texts ] }
        groups.append(g0)
        #growSets
        while groups:
            g = groups.pop()
            #print 'grow: ' + str(g['seq']) + ' ' + str(g['size'])
            pos = -1
            word = None
            cnt = 0
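            # scan every gap position s of the current sequence (before, between and
            # after its items) and keep the single candidate word with the highest support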
            for s in range(len(g['seq']) + 1):
                #print "s = " + str(s);
                fdist = nltk.probability.FreqDist()
                for t in g['DBs']:
                    if s == 0:
                        l = 0
                    else:
                        l = t['seqIndices'][s-1] + 1
                    if s == len(g['seq']):
                        r = len(t['text'])
                    else:
                        r = t['seqIndices'][s]
                    for w in t['text'][l:r]:
                        fdist.inc(w, t['count'])
                    #print self.printSeq(t['text'][l:r])
                if fdist.N() > 0 and fdist[ fdist.max() ] > cnt:
                    pos = s
                    word = fdist.max()
                    cnt = fdist[word]
            if not self._checkMinSupport(cnt): #could not find new item with enough support, discard branch
                continue
            #print str(pos) + " : " + self.itemset[word] + " : " + str(cnt)
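            # if the best word is exactly as frequent as the first projected text, that
            # text's whole token sequence is absorbed into the node/link graph at once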
            if cnt == g['DBs'][0]['count']:
                #take the entirety of the top tweet
                t = g['DBs'][0]
                tnodes = []
                for i in range(0, len(t['text'])):
                    try:
                        j = t['seqIndices'].index(i)
                        tnodes.append(g['seq'][j])
                    except ValueError:
                        newWord = {'entity':self.itemset[t['text'][i]], 'freq':cnt, 'id':len(nodes)}
                        nodes.append(newWord)
                        tnodes.append(newWord)
                for l in range(0, len(t['text'])-1):
                    if not l in t['seqIndices'] or not (l+1) in t['seqIndices']:
                        if not tnodes[l]['id'] in links:
                            links[ tnodes[l]['id'] ] = {}
                        links[ tnodes[l]['id'] ][ tnodes[l+1]['id'] ] = cnt
                for l in range(0, len(t['seqIndices'])-1):
                    if t['seqIndices'][l+1] - t['seqIndices'][l] > 1:
                        links[tnodes[t['seqIndices'][l]]['id']][tnodes[t['seqIndices'][l+1]]['id']] -= cnt
                if self._checkMinSupport(g['size']-cnt):
                    g0 = {'seq': g['seq'], 'newItemPos':None, 'size': g['size']-cnt, 'DBs': g['DBs'][1:]}
                    self._insertIntoSortedList(groups, g0)
            else:
                g0 = {'seq': g['seq'], 'newItemPos':None, 'size': g['size']-cnt, 'DBs': []}
                #add new node
                newWord = {'entity':self.itemset[word], 'freq':cnt, 'id':len(nodes)}
                nodes.append(newWord)
                newseq = list(g['seq'])
                newseq.insert(pos, newWord)
                g1 = {'seq': newseq, 'newItemPos': pos, 'size':cnt, 'DBs': []}
                #add new links
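                # wire the new node into the adjacency list: before the first sequence
                # item, after the last one, or spliced between two existing neighbours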
                if pos <= 0:
                    if g['seq']:
                        links[newWord['id']] = {g['seq'][0]['id']:cnt}
                elif pos >= len(g['seq']):
                    if not g['seq'][-1]['id'] in links:
                        links[g['seq'][-1]['id']] = {}
                    links[g['seq'][-1]['id']][newWord['id']] = cnt
                else:
                    links[g['seq'][pos-1]['id']][g['seq'][pos]['id']] -= cnt # the direct link gives up the weight now routed through the new node
                    links[g['seq'][pos-1]['id']][newWord['id']] = cnt
                    links[newWord['id']]={g['seq'][pos]['id']:cnt}
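                # partition the projected texts: those containing the new word inside the
                # allowed window extend the sequence (g1), the rest stay with g0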
                for t in g['DBs']:
                    if pos == 0:
                        l = 0
                    else:
                        l = t['seqIndices'][pos-1] + 1
                    if pos == len(g['seq']):
                        r = len(t['text'])
                    else:
                        r = t['seqIndices'][pos]
                    try:
                        i = l + t['text'][l:r].index(word) # search only inside the (l, r) window between the anchored positions
                        t['seqIndices'].insert(pos, i)
                        g1['DBs'].append(t)
                    except ValueError:
                        g0['DBs'].append(t)
                #print 'g0: ' + str(g0['seq']) + ' ' + str(g0['newItemPos']) + ' ' + str(g0['size']) + ' DBs: ' + str(g0['DBs'][:3])
                #print 'g1: ' + str(g1['seq']) + ' ' + str(g1['newItemPos']) + ' ' + str(g1['size']) + ' DBs: ' + str(g1['DBs'][:3])
                self._insertIntoSortedList(groups, g1)
                if self._checkMinSupport( g0['size'] ):
                    self._insertIntoSortedList(groups, g0)
        # for g in groups:
        # print ' ' + self.printSeq(g['seq']) + ' ' + str(g['size'])
        self.nodes = nodes
        self.links = []
        for l in links.keys():
            for r in links[l].keys():
                self.links.append({'source':l, 'target':r, 'freq':links[l][r]})
        results = {'entities':self.nodes, 'links':self.links}
        f = open('growPhraseResults.json', 'w')
        f.write(json.dumps(results, ensure_ascii=False).encode('utf-8'))
        f.close()
        timer.printElapsed()
    def _checkMinSupport(self, cnt):
        if not hasattr(self, 'dbSize'):
            raise NameError("dbSize is not defined.")
        if not hasattr(self, 'min_support'):
            raise NameError("min_support is not defined.")
        if cnt >= self.dbSize * self.min_support:
            return True
        else:
            return False
    def printSeq(self, s):
        return ' '.join([self.itemset[i] for i in s])
    def _insertIntoSortedList(self, sortedlist, item):
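        # keep the worklist sorted by ascending 'size'; growSets() pops from the end,
        # so the largest group is always expanded next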
        i = bisect.bisect_left([l['size'] for l in sortedlist], item['size'])
        sortedlist.insert(i, item)
        return sortedlist
def main(argv):
    c = Corpus('../data/raw/goal1.tsv', 1, 2)
    c.growSets()
if __name__ == "__main__":
    main(sys.argv[1:])
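# Example invocation (assumes the hard-coded ../data/raw/goal1.tsv exists, with tweet
# text in column 1 and a per-tweet count in column 2, and that the NLTK stopwords
# corpus is installed):
#   python growphrase.py
# Output is written to growPhraseResults.json in the working directory.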