The class `treeNode` defines the nodes of the FP-Tree. Each node holds the item name and an occurrence count; `nodeLink` links nodes that hold the same item across the tree, and `parent` points to the node's parent in the FP-Tree.
class treeNode:
    """A node of the FP-tree.

    Attributes:
        name: the item stored at this node.
        count: number of transactions whose prefix path passes through here.
        nodeLink: next node holding the same item (header-table chain);
            starts as None and is wired up later by updateHeader().
        parent: parent treeNode (None only for the root).
        children: dict mapping child item name -> child treeNode.
    """
    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue
        self.count = numOccur
        self.nodeLink = None
        self.parent = parentNode
        self.children = {}

    def inc(self, numOccur):
        """Increase this node's count by numOccur."""
        self.count += numOccur

    def disp(self, ind=1):
        """Print the subtree rooted here, indented by depth (debugging aid)."""
        # print() with the default sep=' ' reproduces the spacing of the
        # original Python 2 print statement.
        print(' '*ind, self.name, ' ', self.count)
        for child in self.children.values():
            child.disp(ind+1)
The `createTree` function takes a dataset and the minimum support value and builds an FP-tree, including the tree itself and a header table.
def createTree(dataSet, minSup=1):  # create FP-tree from dataset but don't mine
    """Build an FP-tree from dataSet (dict: frozenset(transaction) -> count).

    Returns (root treeNode, headerTable), where headerTable maps
    item -> [support count, head of that item's node-link chain],
    or (None, None) when no item reaches minSup.
    """
    headerTable = {}
    # First pass: accumulate the support count of every individual item.
    for trans in dataSet:
        for item in trans:
            headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
    # Remove items not meeting minSup.  Iterate over a snapshot of the keys:
    # deleting while iterating the live view raises RuntimeError on Python 3.
    for k in list(headerTable.keys()):
        if headerTable[k] < minSup:
            del headerTable[k]
    freqItemSet = set(headerTable.keys())
    # If no items meet min support, there is nothing to build.
    if len(freqItemSet) == 0:
        return None, None
    # Reformat headerTable entries to [count, head of node-link chain].
    for k in headerTable:
        headerTable[k] = [headerTable[k], None]
    retTree = treeNode('Null Set', 1, None)  # root of the tree
    # Second pass: insert every transaction, items sorted by descending
    # support.  Ties are broken by item name so the resulting tree is
    # deterministic (a bare count key leaves tie order arbitrary).
    for tranSet, count in dataSet.items():
        localD = {}
        # Keep only the frequent items of this transaction, with their counts.
        for item in tranSet:
            if item in freqItemSet:
                localD[item] = headerTable[item][0]
        if len(localD) > 0:
            orderedItems = [v[0] for v in sorted(localD.items(),
                                                 key=lambda p: (p[1], p[0]),
                                                 reverse=True)]
            # Populate the tree with this ordered frequent itemset.
            updateTree(orderedItems, retTree, headerTable, count)
    return retTree, headerTable
1
def updateTree(items, inTree, headerTable, count):
    """Insert one support-ordered transaction (items) into the tree at inTree.

    count is the transaction's multiplicity.  New nodes are appended to the
    appropriate node-link chain in headerTable.
    """
    if items[0] in inTree.children:
        # Prefix already present: just increment the existing node's count.
        inTree.children[items[0]].inc(count)
    else:
        # New branch: create the child and wire it into the header table.
        inTree.children[items[0]] = treeNode(items[0], count, inTree)
        if headerTable[items[0]][1] is None:
            # First node ever created for this item becomes the chain head.
            headerTable[items[0]][1] = inTree.children[items[0]]
        else:
            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
    # Recurse with the remaining ordered items under the (possibly new) child.
    if len(items) > 1:
        updateTree(items[1:], inTree.children[items[0]], headerTable, count)
# This version deliberately does NOT use recursion: a node-link chain can be
# arbitrarily long, and recursing over it could exceed the recursion limit.
def updateHeader(nodeToTest, targetNode):
    """Append targetNode to the end of the node-link chain starting at nodeToTest."""
    while nodeToTest.nodeLink is not None:
        nodeToTest = nodeToTest.nodeLink
    nodeToTest.nodeLink = targetNode
Let's test the code!
# A tiny hard-coded dataset for illustrating the FP-growth algorithm.
def loadSimpDat():
    """Return a fixed list of six example transactions (lists of item names)."""
    return [
        ['r', 'z', 'h', 'j', 'p'],
        ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
        ['z'],
        ['r', 'x', 'n', 'o', 's'],
        ['y', 'r', 'x', 'z', 'q', 't', 'p'],
        ['y', 'z', 'x', 'e', 'q', 's', 't', 'm'],
    ]
# The tree builder needs the dataset in a special format:
# a dictionary with transactions (frozensets) as keys and counts as values.
def createInitSet(dataSet):
    """Convert a list of transactions into {frozenset(transaction): count}.

    Duplicate transactions accumulate their counts; the original version
    silently reset a repeated transaction's count back to 1.
    """
    retDict = {}
    for trans in dataSet:
        key = frozenset(trans)
        retDict[key] = retDict.get(key, 0) + 1
    return retDict
Now we are ready to test it
# Notebook cells: build the demo dataset, convert it, and grow the FP-tree.
simpDat = loadSimpDat()
simpDat  # bare expression: notebook displays the transaction list
initSet = createInitSet(simpDat)
initSet  # notebook displays the {frozenset: count} dict
myFPtree, myHeaderTable = createTree(initSet, 3) # min-sup = 3
myFPtree.disp()  # print the tree structure for inspection
myHeaderTable
Now, let's mine the FP-tree
We need to do three basic steps:
Let's find the conditional pattern base: it is the collection of paths from the root that end with a specific item, i.e. each path is a prefix path.
# treeNode comes from the header table (head of an item's node-link chain)
def findPrefixPath(treeNode):
    """Collect the conditional pattern base for one item.

    Walks the node-link chain starting at treeNode and returns
    {frozenset(prefix path): count}, where each prefix path is the set of
    items between (and excluding) the node itself and the root.
    """
    condPats = {}
    while treeNode is not None:
        prefixPath = []
        ascendTree(treeNode, prefixPath)
        # prefixPath[0] is the node's own item, so a length-1 path has no
        # actual prefix and is skipped.
        if len(prefixPath) > 1:
            condPats[frozenset(prefixPath[1:])] = treeNode.count
        treeNode = treeNode.nodeLink
    return condPats
# ascends from leaf node to root
def ascendTree(leafNode, prefixPath):
    """Append the names of leafNode and its ancestors (root excluded) to prefixPath.

    Iterative rather than recursive: a very deep path would otherwise risk
    hitting the recursion limit, the same concern updateHeader guards against.
    """
    while leafNode.parent is not None:
        prefixPath.append(leafNode.name)
        leafNode = leafNode.parent
Let's test it
# Notebook cells: conditional pattern bases for a few items.
findPrefixPath(myHeaderTable['x'][1])
findPrefixPath(myHeaderTable['z'][1])
findPrefixPath(myHeaderTable['r'][1])
Now, let's mine the tree
def mineTree(inTree, headerTable, minSup, preFix, freqItemList, verbose=False):
    """Recursively mine an FP-tree for frequent itemsets.

    Every frequent itemset found is appended to freqItemList (mutated in
    place).  preFix is the set of items conditioned on so far; pass set()
    at the top level.  When verbose is True, each conditional tree is printed.
    """
    # Items in ascending support order (bottom of the header table first).
    # Sort on the count p[1][0], not the whole [count, node] pair: comparing
    # treeNode objects on a count tie raises TypeError on Python 3.  The
    # item name breaks ties deterministically.
    bigL = [v[0] for v in sorted(headerTable.items(),
                                 key=lambda p: (p[1][0], p[0]))]
    for basePat in bigL:
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        # Every (prefix + basePat) combination reaching this point is frequent.
        freqItemList.append(newFreqSet)
        # 1. conditional pattern base for basePat
        condPattBases = findPrefixPath(headerTable[basePat][1])
        # 2. conditional FP-tree built from that pattern base
        myCondTree, myHead = createTree(condPattBases, minSup)
        # 3. keep mining while the conditional tree still has frequent items
        if myHead is not None:
            if verbose:
                print('conditional tree for: ', newFreqSet)
                myCondTree.disp(1)
            # Propagate verbose; the original dropped it on recursion.
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList,
                     verbose)
OK! It's time to spark the code ...
# List collecting every frequent itemset found by mineTree.
freqItems = []
mineTree(myFPtree, myHeaderTable, 3, set([]), freqItems, verbose=True)
freqItems  # notebook cell: display the mined frequent itemsets
A helper to remove useless characters and URLs and generate a list of words.
import re
def textParse(bigString):
    """Strip URLs from bigString and tokenize it.

    Returns the lowercase tokens longer than two characters, with
    http(s)://... and www.-style URLs removed first.
    """
    # Remove URLs.  The dot in 'www\.' is escaped: unescaped it matched any
    # character, so e.g. 'wwwX...' was wrongly treated as a URL.
    urlsRemoved = re.sub(r'(http(s)*:[/][/]|www\.)([a-z]|[A-Z]|[0-9]|[/.]|[~])*',
                         '', bigString)
    # Split on runs of non-word characters.  The original pattern r'\W*'
    # can match the empty string, which re.split rejects/mishandles on
    # Python 3.7+ (it would split between every character).
    listOfTokens = re.split(r'\W+', urlsRemoved)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
# reading the dataset: each line of the tweet file becomes one parsed
# token list (URLs removed, short tokens dropped by textParse).
tweets = []
with open('tweets200000.txt', 'r') as f:
    for line in f:
        tweets.append(textParse(line))
tweets[0]  # notebook cell: first parsed tweet
len(tweets)
# Mine the first 1500 tweets with minimum support 5.
min_sup = 5
initSetTweets = createInitSet(tweets[0:1500])
tweetFPtree, tweetHeaderTable = createTree(initSetTweets, min_sup)
tweetFreqList = []
mineTree(tweetFPtree, tweetHeaderTable, min_sup, set([]), tweetFreqList)
len(tweetFreqList)  # notebook cell: number of frequent itemsets found
import numpy as np
# Token-count statistics over the parsed tweets (Python 2 print statements).
length_of_tweets = [len(tw) for tw in tweets]
# NOTE(review): under Python 2 this is integer division, so the mean is truncated.
print np.min(length_of_tweets), np.max(length_of_tweets), np.sum(length_of_tweets)/len(length_of_tweets)
length_of_tweets[:10]  # notebook cell: lengths of the first ten tweets
tweets[:10]
# Peek at the raw file lines for comparison with the parsed tweets.
with open('tweets200000.txt', 'r') as f:
    print f.readlines()[:10]
tweetFreqList[0:20]  # notebook cell: first 20 frequent itemsets from the tweets
# --- Manual Apriori-style exploration on the demo data ---
simpDat
min_sup
min_sup = 2
# Count individual item frequencies (candidate 1-itemsets).
f1 = {}
for rec in simpDat:
    for item in rec:
        if item not in f1:
            f1[item] = 1
        else:
            f1[item] += 1
f1
# Prune items below min_sup.
# NOTE(review): deleting keys while iterating .keys() only works on
# Python 2, where keys() returns a list; Python 3 raises RuntimeError here.
for item in f1.keys():
    if f1[item] < min_sup:
        del f1[item]
f1
# Candidate 2-itemsets from the surviving items.
# NOTE(review): this generates ordered pairs — both (i, j) and (j, i) —
# presumably unordered pairs were intended; verify before using the counts.
c2 = []
for i in f1:
    for j in f1:
        if i != j:
            c2.append(tuple([i,j]))
c2