FP-Growth

The class treeNode is a class definition for the nodes of FP-Tree and has variables to hold the count and the name of the node. The nodeLink is used to link similar items and parent is used to link to the parent treeNode in the FP-Tree.

In [1]:
class treeNode:
    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue
        self.count = numOccur
        self.nodeLink = None
        self.parent = parentNode      #needs to be updated
        self.children = {} 
    
    def inc(self, numOccur):
        self.count += numOccur
        
    def disp(self, ind=1):
        print '  '*ind, self.name, ' ', self.count
        for child in self.children.values():
            child.disp(ind+1)

The createTree function takes a dataset and the minimum support value and builds an FP-tree, includuing the tree itself and a header table.

In [2]:
def createTree(dataSet, minSup=1): #create FP-tree from dataset but don't mine
    headerTable = {}
    #go over dataSet twice
    
    #first pass counts frequency of occurance
    for trans in dataSet:
        for item in trans:
            headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
        
    #remove items not meeting minSup
    for k in headerTable.keys():  
        if headerTable[k] < minSup: 
            del(headerTable[k])
            
    freqItemSet = set(headerTable.keys())
    #print 'freqItemSet: ',freqItemSet
    
    #if no items meet min support -->get out
    if len(freqItemSet) == 0: 
        return None, None  
    
    #reformat headerTable to use Node link 
    for k in headerTable:
        headerTable[k] = [headerTable[k], None] 
    #print 'headerTable: ',headerTable
    
    retTree = treeNode('Null Set', 1, None) #create tree
    
    #go through dataset 2nd time
    for tranSet, count in dataSet.items():  
        localD = {}
        
        #put transaction items in order
        for item in tranSet:  
            if item in freqItemSet:
                localD[item] = headerTable[item][0]
                
        if len(localD) > 0:
            orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]
            #populate tree with ordered freq itemset
            updateTree(orderedItems, retTree, headerTable, count)
    
    #return tree and header table
    return retTree, headerTable 

1

In [3]:
def updateTree(items, inTree, headerTable, count):
    #check if orderedItems[0] in retTree.children
    if items[0] in inTree.children:
        #incrament count
        inTree.children[items[0]].inc(count) 
    else:   
        #add items[0] to inTree.children
        inTree.children[items[0]] = treeNode(items[0], count, inTree)
        #update header table 
        if headerTable[items[0]][1] == None:
            headerTable[items[0]][1] = inTree.children[items[0]]
        else:
            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
    
    #call updateTree() with remaining ordered items
    if len(items) > 1:
        updateTree(items[1::], inTree.children[items[0]], headerTable, count)
In [4]:
#this version does not use recursion
#Do not use recursion to traverse a linked list!
#the linked list might be too long and you reach the limits :-(
def updateHeader(nodeToTest, targetNode):   
    while (nodeToTest.nodeLink != None):    
        nodeToTest = nodeToTest.nodeLink
    nodeToTest.nodeLink = targetNode

Let's test the code!

In [5]:
# a simple dataset to illustrate the algorithm
def loadSimpDat():
    simpDat = [['r', 'z', 'h', 'j', 'p'],
               ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
               ['z'],
               ['r', 'x', 'n', 'o', 's'],
               ['y', 'r', 'x', 'z', 'q', 't', 'p'],
               ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
    return simpDat
In [6]:
# We need the dataset in a special format, let's do it
# a dictionary with transactions as key and count as value
def createInitSet(dataSet):
    retDict = {}
    for trans in dataSet:
        retDict[frozenset(trans)] = 1
    return retDict

Now we are ready to test it

In [7]:
simpDat = loadSimpDat()
simpDat
Out[7]:
[['r', 'z', 'h', 'j', 'p'],
 ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
 ['z'],
 ['r', 'x', 'n', 'o', 's'],
 ['y', 'r', 'x', 'z', 'q', 't', 'p'],
 ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
In [8]:
initSet = createInitSet(simpDat)
initSet
Out[8]:
{frozenset({'e', 'm', 'q', 's', 't', 'x', 'y', 'z'}): 1,
 frozenset({'n', 'o', 'r', 's', 'x'}): 1,
 frozenset({'z'}): 1,
 frozenset({'s', 't', 'u', 'v', 'w', 'x', 'y', 'z'}): 1,
 frozenset({'p', 'q', 'r', 't', 'x', 'y', 'z'}): 1,
 frozenset({'h', 'j', 'p', 'r', 'z'}): 1}
In [9]:
myFPtree, myHeaderTable = createTree(initSet, 3) # min-sup = 3
In [10]:
myFPtree.disp()
   Null Set   1
     x   1
       s   1
         r   1
     z   5
       x   3
         y   3
           s   2
             t   2
           r   1
             t   1
       r   1
In [11]:
myHeaderTable
Out[11]:
{'r': [3, <__main__.treeNode instance at 0x0000000003CD33C8>],
 's': [3, <__main__.treeNode instance at 0x0000000003CD3288>],
 't': [3, <__main__.treeNode instance at 0x0000000003CD3308>],
 'x': [4, <__main__.treeNode instance at 0x0000000003CD31C8>],
 'y': [3, <__main__.treeNode instance at 0x0000000003CD32C8>],
 'z': [5, <__main__.treeNode instance at 0x0000000003CBEFC8>]}

Now, let's mine the FP-tree

We need to do three basic steps:

  1. get conditional pattern bases from the FP-tree
  2. from the condditional pattern base, construct a conditional FP-tree
  3. recursively repeat steps 1 and 2 until the tree contains a single item

Let's find the conditional pattern base, it is a collection of paths from the root that end with an specific item, i.e. it is a prefix path.

In [12]:
#treeNode comes from header table
def findPrefixPath(treeNode):
    condPats = {}
    while treeNode != None:
        prefixPath = []
        ascendTree(treeNode, prefixPath)
        if len(prefixPath) > 1: 
            condPats[frozenset(prefixPath[1:])] = treeNode.count
        treeNode = treeNode.nodeLink
    return condPats
In [13]:
#ascends from leaf node to root
def ascendTree(leafNode, prefixPath):
    if leafNode.parent != None:
        prefixPath.append(leafNode.name)
        ascendTree(leafNode.parent, prefixPath)

Let's test it

In [14]:
findPrefixPath(myHeaderTable['x'][1])
Out[14]:
{frozenset({'z'}): 3}
In [15]:
findPrefixPath(myHeaderTable['z'][1])
Out[15]:
{}
In [16]:
findPrefixPath(myHeaderTable['r'][1])
Out[16]:
{frozenset({'s', 'x'}): 1, frozenset({'z'}): 1, frozenset({'x', 'y', 'z'}): 1}

Now, let's mine the tree

In [17]:
def mineTree(inTree, headerTable, minSup, preFix, freqItemList, verbose=False):
    #(sort header table)
    bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1])]
    #start from bottom of header table
    for basePat in bigL:
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        #print 'finalFrequent Item: ',newFreqSet    
        
        #append to set
        freqItemList.append(newFreqSet)
        condPattBases = findPrefixPath(headerTable[basePat][1]) # basePat
        #print 'condPattBases :',basePat, condPattBases
        
        #2. construct cond FP-tree from cond. pattern base
        myCondTree, myHead = createTree(condPattBases, minSup)
        #print 'head from conditional tree: ', myHead
        
        #3. mine cond. FP-tree
        if myHead != None: 
            if verbose:
                print 'conditional tree for: ',newFreqSet
                myCondTree.disp(1)            
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)

OK! It's time to spark the code ...

In [19]:
# list to store the frequent itsets
freqItems = []

mineTree(myFPtree, myHeaderTable, 3, set([]), freqItems, verbose=True)
conditional tree for:  set(['s'])
   Null Set   1
     x   3
conditional tree for:  set(['y'])
   Null Set   1
     x   3
       z   3
conditional tree for:  set(['t'])
   Null Set   1
     y   3
       x   3
         z   3
conditional tree for:  set(['x'])
   Null Set   1
     z   3
In [20]:
freqItems
Out[20]:
[{'s'},
 {'s', 'x'},
 {'y'},
 {'x', 'y'},
 {'y', 'z'},
 {'x', 'y', 'z'},
 {'t'},
 {'t', 'y'},
 {'t', 'x'},
 {'t', 'x', 'y'},
 {'t', 'z'},
 {'t', 'y', 'z'},
 {'t', 'x', 'z'},
 {'t', 'x', 'y', 'z'},
 {'r'},
 {'x'},
 {'x', 'z'},
 {'z'}]

Work in the Wild

a helper to delete useless characters and URLS and generate a list of words

In [21]:
import re

def textParse(bigString):
    urlsRemoved = re.sub('(http(s)*:[/][/]|www.)([a-z]|[A-Z]|[0-9]|[/.]|[~])*', '', bigString)    
    listOfTokens = re.split(r'\W*', urlsRemoved)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
In [22]:
# reading the dataset
tweets = []
with open('tweets200000.txt', 'r') as f:
    for line in f:
        tweets.append(textParse(line))
In [23]:
tweets[0]
Out[23]:
['wind',
 'mph',
 'nne',
 'barometer',
 'rising',
 'slowly',
 'temperature',
 'rain',
 'today',
 'humidity']
In [24]:
len(tweets)
Out[24]:
225318
In [25]:
min_sup = 5

initSetTweets = createInitSet(tweets[0:1500])
tweetFPtree, tweetHeaderTable = createTree(initSetTweets, min_sup)

tweetFreqList = []
mineTree(tweetFPtree, tweetHeaderTable, min_sup, set([]), tweetFreqList)
In [26]:
len(tweetFreqList)
Out[26]:
16483
In [27]:
import numpy as np

length_of_tweets = [len(tw) for tw in tweets]
print np.min(length_of_tweets), np.max(length_of_tweets), np.sum(length_of_tweets)/len(length_of_tweets)
0 27 8
In [28]:
length_of_tweets[:10]
Out[28]:
[10, 8, 0, 2, 9, 5, 7, 13, 4, 16]
In [29]:
tweets[:10]
Out[29]:
[['wind',
  'mph',
  'nne',
  'barometer',
  'rising',
  'slowly',
  'temperature',
  'rain',
  'today',
  'humidity'],
 ['pausa',
  'pro',
  'caf',
  'antes',
  'embarcar',
  'ximo',
  'trippolisontheroad',
  'danipolisviaja'],
 [],
 ['pause', 'for'],
 ['good',
  'morning',
  'morning',
  'saturday',
  'diner',
  'breakfast',
  'nucorpsofcadetsring',
  'ring',
  'college'],
 ['gratefuldead', 'recordstoredayus', 'toms', 'music', 'trade'],
 ['egg', 'muffin', 'rocket', 'baby', 'bakery', 'rocketbabybaker', 'wauwatosa'],
 ['lyricwaters',
  'should',
  'gave',
  'the',
  'neighbor',
  'buzz',
  'got',
  'ice',
  'cream',
  'and',
  'moms',
  'baked',
  'goodies'],
 ['the', 'way', 'mamaroneck', 'mamaroneck'],
 ['hiring',
  'read',
  'about',
  'our',
  'latest',
  'job',
  'opening',
  'here',
  'retail',
  'sales',
  'consultant',
  'cwa',
  'mob',
  'bryn',
  'mawr',
  'retail']]
In [31]:
with open('tweets200000.txt', 'r') as f:
    print f.readlines()[:10]
['"Wind 3.2 mph NNE. Barometer 30.20 in, Rising slowly. Temperature 49.3 \xf8F. Rain today 0.00 in. Humidity 32%"\n', '"Pausa pro caf\x82 antes de embarcar no pr\xa2ximo v\x93o. #trippolisontheroad #danipolisviaja \n', '\n', 'Pause for? https://t.co/PhcJ4oYktP"\n', 'Good. Morning. #morning #Saturday #diner #VT #breakfast #nucorpsofcadetsring #ring #college? https://t.co/dBZ7dbwX6f\n', '@gratefuldead recordstoredayus ?????? @ TOMS MUSIC TRADE https://t.co/CURRmn6iJo\n', '"Egg in a muffin!!! (@ Rocket Baby Bakery - @rocketbabybaker in Wauwatosa, WI) https://t.co/mwfhrcxtRp"\n', "@lyricwaters should've gave the neighbor  a buzz. Iv got ice cream and moms baked goodies ??\n", '"On the way to CT! (@ Mamaroneck, NY in Mamaroneck, NY) https://t.co/6rpe6MXDkB"\n', "We're #hiring! Read about our latest #job opening here: Retail Sales Consultant [CWA MOB] Bryn Mawr PA - https://t.co/bBwxSPsL4f #Retail\n"]
In [33]:
tweetFreqList[0:20]
Out[33]:
[{'cincinnati'},
 {'trainee'},
 {'hiring', 'trainee'},
 {'job', 'trainee'},
 {'hiring', 'job', 'trainee'},
 {'retail', 'trainee'},
 {'hiring', 'retail', 'trainee'},
 {'job', 'retail', 'trainee'},
 {'hiring', 'job', 'retail', 'trainee'},
 {'jobs', 'trainee'},
 {'hiring', 'jobs', 'trainee'},
 {'job', 'jobs', 'trainee'},
 {'hiring', 'job', 'jobs', 'trainee'},
 {'jobs', 'retail', 'trainee'},
 {'hiring', 'jobs', 'retail', 'trainee'},
 {'job', 'jobs', 'retail', 'trainee'},
 {'hiring', 'job', 'jobs', 'retail', 'trainee'},
 {'fallschurch'},
 {'line'},
 {'cafe'}]
In [34]:
simpDat
Out[34]:
[['r', 'z', 'h', 'j', 'p'],
 ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
 ['z'],
 ['r', 'x', 'n', 'o', 's'],
 ['y', 'r', 'x', 'z', 'q', 't', 'p'],
 ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
In [35]:
min_sup
Out[35]:
5
In [36]:
min_sup = 2
In [37]:
f1 = {}
for rec in simpDat:
    for item in rec:
        if item not in f1: 
            f1[item] = 1
        else:
            f1[item] += 1
In [38]:
f1
Out[38]:
{'e': 1,
 'h': 1,
 'j': 1,
 'm': 1,
 'n': 1,
 'o': 1,
 'p': 2,
 'q': 2,
 'r': 3,
 's': 3,
 't': 3,
 'u': 1,
 'v': 1,
 'w': 1,
 'x': 4,
 'y': 3,
 'z': 5}
In [42]:
for item in f1.keys():
    if f1[item] < min_sup:
        del f1[item]
        
f1
Out[42]:
{'p': 2, 'q': 2, 'r': 3, 's': 3, 't': 3, 'x': 4, 'y': 3, 'z': 5}
In [43]:
c2 = []
for i in f1:
    for j in f1:
        if i != j:
            c2.append(tuple([i,j]))
            
c2
Out[43]:
[('q', 'p'),
 ('q', 's'),
 ('q', 'r'),
 ('q', 't'),
 ('q', 'y'),
 ('q', 'x'),
 ('q', 'z'),
 ('p', 'q'),
 ('p', 's'),
 ('p', 'r'),
 ('p', 't'),
 ('p', 'y'),
 ('p', 'x'),
 ('p', 'z'),
 ('s', 'q'),
 ('s', 'p'),
 ('s', 'r'),
 ('s', 't'),
 ('s', 'y'),
 ('s', 'x'),
 ('s', 'z'),
 ('r', 'q'),
 ('r', 'p'),
 ('r', 's'),
 ('r', 't'),
 ('r', 'y'),
 ('r', 'x'),
 ('r', 'z'),
 ('t', 'q'),
 ('t', 'p'),
 ('t', 's'),
 ('t', 'r'),
 ('t', 'y'),
 ('t', 'x'),
 ('t', 'z'),
 ('y', 'q'),
 ('y', 'p'),
 ('y', 's'),
 ('y', 'r'),
 ('y', 't'),
 ('y', 'x'),
 ('y', 'z'),
 ('x', 'q'),
 ('x', 'p'),
 ('x', 's'),
 ('x', 'r'),
 ('x', 't'),
 ('x', 'y'),
 ('x', 'z'),
 ('z', 'q'),
 ('z', 'p'),
 ('z', 's'),
 ('z', 'r'),
 ('z', 't'),
 ('z', 'y'),
 ('z', 'x')]
In [ ]: