def growSets()

in archive/preprocess/growphrase.py [0:0]


	def growSets(self):
		"""Grow frequent phrases from ``self.texts`` into a node/link graph.

		Starting from a single group that holds every text and an empty
		sequence, the loop repeatedly pops a group, finds the most frequent
		item in any gap of the group's current sequence, and then either

		* emits the whole first text of the group as a chain of nodes, when the
		  best count equals that first text's own count, or
		* splits the group into the texts that contain the item in the chosen
		  gap (their sequence is extended by the item) and the remaining texts.

		Reads ``self.texts`` (each entry is indexed as ``[0]`` = item sequence
		and ``[1]`` = count), ``self.dbSize`` and ``self.itemset``.

		Side effects: sets ``self.nodes`` (list of ``{'entity', 'freq', 'id'}``
		dicts) and ``self.links`` (list of ``{'source', 'target', 'freq'}``
		dicts), and writes both as JSON to ``growPhraseResults.json``.
		Returns None.
		"""
		timer = Timer()

		def gapBounds(db, gap, seqLen):
			# Half-open range [lo, hi) of positions in db['text'] lying strictly
			# between sequence item gap-1 and sequence item gap. Gap 0 starts at
			# the beginning of the text; gap seqLen runs to its end.
			if gap == 0:
				lo = 0
			else:
				lo = db['seqIndices'][gap - 1] + 1
			if gap == seqLen:
				hi = len(db['text'])
			else:
				hi = db['seqIndices'][gap]
			return lo, hi

		groups = []
		nodes = []
		links = {}  # adjacency list: source node id -> {target node id: freq}

		# Initial group: empty sequence, every text, nothing matched yet.
		groups.append({
			'seq': [],
			'newItemPos': None,
			'size': self.dbSize,
			'DBs': [{'text': t[0], 'count': t[1], 'seqIndices': []} for t in self.texts],
		})

		while groups:
			g = groups.pop()
			seq = g['seq']
			seqLen = len(seq)

			# Find the (gap, word) pair with the highest count-weighted frequency.
			# The comparison is strict, so the earliest gap wins ties.
			pos = -1
			word = None
			cnt = 0
			for s in range(seqLen + 1):
				fdist = nltk.probability.FreqDist()
				for t in g['DBs']:
					lo, hi = gapBounds(t, s, seqLen)
					for w in t['text'][lo:hi]:
						fdist.inc(w, t['count'])
				if fdist.N() > 0:
					best = fdist.max()
					if fdist[best] > cnt:
						pos = s
						word = best
						cnt = fdist[best]

			# No new item with enough support: discard this branch.
			if not self._checkMinSupport(cnt):
				continue

			if cnt == g['DBs'][0]['count']:
				# The best count equals the first text's own count:
				# take the entirety of that text as one chain of nodes.
				top = g['DBs'][0]
				text = top['text']
				seqIdx = top['seqIndices']

				# One node per text position: reuse the sequence node where the
				# position is already matched, otherwise create a fresh node.
				tnodes = []
				for i in range(len(text)):
					if i in seqIdx:
						tnodes.append(seq[seqIdx.index(i)])
					else:
						newWord = {'entity': self.itemset[text[i]], 'freq': cnt, 'id': len(nodes)}
						nodes.append(newWord)
						tnodes.append(newWord)

				# Link neighbouring positions unless both ends are existing
				# sequence nodes.
				for i in range(len(text) - 1):
					if i not in seqIdx or (i + 1) not in seqIdx:
						links.setdefault(tnodes[i]['id'], {})[tnodes[i + 1]['id']] = cnt

				# Consecutive sequence items that are not adjacent in this text
				# now have nodes between them, so reduce their direct link.
				for k in range(len(seqIdx) - 1):
					if seqIdx[k + 1] - seqIdx[k] > 1:
						links[tnodes[seqIdx[k]]['id']][tnodes[seqIdx[k + 1]]['id']] -= cnt

				# Re-queue the remaining texts under the same sequence.
				if self._checkMinSupport(g['size'] - cnt):
					rest = {'seq': seq, 'newItemPos': None, 'size': g['size'] - cnt, 'DBs': g['DBs'][1:]}
					self._insertIntoSortedList(groups, rest)
			else:
				# Split the group: g1 gets the texts containing `word` in gap
				# `pos`, g0 keeps the others under the unchanged sequence.
				g0 = {'seq': seq, 'newItemPos': None, 'size': g['size'] - cnt, 'DBs': []}

				# Add the new node and the sequence extended by it.
				newWord = {'entity': self.itemset[word], 'freq': cnt, 'id': len(nodes)}
				nodes.append(newWord)
				newseq = list(seq)
				newseq.insert(pos, newWord)
				g1 = {'seq': newseq, 'newItemPos': pos, 'size': cnt, 'DBs': []}

				# Add the new links around the inserted node.
				if pos <= 0:
					if seq:
						links[newWord['id']] = {seq[0]['id']: cnt}
				elif pos >= seqLen:
					links.setdefault(seq[-1]['id'], {})[newWord['id']] = cnt
				else:
					# Inserted between two existing sequence nodes: move `cnt`
					# from their direct link onto the two new links.
					# NOTE(review): assumes the direct link already exists
					# (original carried a "#?" marker here) - confirm.
					links[seq[pos - 1]['id']][seq[pos]['id']] -= cnt
					links[seq[pos - 1]['id']][newWord['id']] = cnt
					links[newWord['id']] = {seq[pos]['id']: cnt}

				for t in g['DBs']:
					lo, hi = gapBounds(t, pos, seqLen)
					try:
						# Search only inside the gap [lo, hi), the same window the
						# frequency count used. `hi` is an absolute end index, not
						# a length, so slicing to lo+hi would overshoot the gap
						# and could insert an out-of-order sequence index.
						i = lo + t['text'][lo:hi].index(word)
					except ValueError:
						g0['DBs'].append(t)
					else:
						t['seqIndices'].insert(pos, i)
						g1['DBs'].append(t)

				self._insertIntoSortedList(groups, g1)
				if self._checkMinSupport(g0['size']):
					self._insertIntoSortedList(groups, g0)

		# Flatten the adjacency list into a list of link records.
		self.nodes = nodes
		self.links = []
		for source, targets in links.items():
			for target, freq in targets.items():
				self.links.append({'source': source, 'target': target, 'freq': freq})

		results = {'entities': self.nodes, 'links': self.links}
		# Context manager closes the file even if serialisation or the write fails.
		# NOTE(review): ensure_ascii=False emits non-ASCII entities unescaped;
		# confirm the default file encoding is suitable for the data.
		with open('growPhraseResults.json', 'w') as f:
			f.write(json.dumps(results, ensure_ascii=False))

		timer.printElapsed()