程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
您现在的位置: 程式師世界 >> 編程語言 >  >> 更多編程語言 >> Python

[Xinghai essays] association rules (II) Python implementation

編輯:Python
def loadDataSet():
    """Return the sample transaction database (a list of item lists)."""
    transactions = [
        [1, 2, 5],
        [2, 4],
        [2, 3],
        [1, 2, 4],
        [1, 3],
        [2, 3],
        [1, 3],
        [1, 2, 3, 5],
        [1, 2, 3],
    ]
    return transactions
# 1. Build the candidate 1-itemsets C1.
def createC1(dataSet):
    """Build the sorted list of candidate 1-itemsets from the transactions.

    Each candidate is a frozenset holding one item.  frozenset is frozen
    (immutable, hashable), so the itemsets can later be used as dictionary
    keys when counting support; the trade-off is no add/remove methods.

    :param dataSet: list of transactions (each a list of items)
    :return: sorted list of single-item frozensets
    """
    # A set de-duplicates in O(1) per item, replacing the original
    # O(n^2) `not [item] in C1` list-membership scan.
    items = set()
    for transaction in dataSet:
        items.update(transaction)
    return [frozenset([item]) for item in sorted(items)]
def apriori(dataSet, minSupport = 0.5):
    """Run the Apriori algorithm over the transaction data.

    :param dataSet: list of transactions (each a list of items)
    :param minSupport: minimum support threshold for a frequent itemset
    :return: (L, supportData) where L is a list of frequent-itemset lists
             (L[0] holds 1-itemsets, L[1] 2-itemsets, ...; the final entry
             is the empty list that stopped the loop) and supportData maps
             every counted candidate frozenset to its support.
    """
    C1 = createC1(dataSet)
    # Convert each transaction to a set once, consistent with the second
    # implementation further down; scanD only needs membership tests.
    D = list(map(set, dataSet))
    # scanD keeps the candidates whose support reaches minSupport; e.g.
    # {4} is dropped at minSupport=0.5 because it appears in only ~25%
    # of the sample transactions.
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    # Keep growing itemset size while the previous pass found anything.
    while (len(L[k-2]) > 0):
        # Merge the frequent (k-1)-itemsets into candidate k-itemsets,
        # e.g. 2nd pass: [frozenset({1, 2}), frozenset({1, 5}), ...]
        #      3rd pass: [frozenset({1, 2, 5}), frozenset({1, 2, 3})]
        Ck = aprioriGen(L[k-2], k)
        print("ck:", Ck)
        # Count support for the new candidates and filter again.
        Lk, supK = scanD(D, Ck, minSupport)
        # Fold the new supports into the running dictionary.
        supportData.update(supK)
        print("lk:", Lk)
        L.append(Lk)
        k += 1
    return L, supportData
# Filter out candidate itemsets that do not reach the support threshold.
def scanD(D, Ck, minSupport):
    """Count candidate support over D and return the frequent itemsets.

    :param D: list of transactions (iterables of items)
    :param Ck: list of candidate itemsets (frozensets — hashable, so they
               can serve directly as dictionary keys)
    :param minSupport: minimum support threshold
    :return: (Lk, supportData) — Lk is the list of candidates whose support
             is >= minSupport; supportData maps every candidate that appeared
             at least once to its support, e.g.
             {frozenset({1}): 0.5, frozenset({3}): 0.75, ...}
    """
    # Count, per candidate, how many transactions contain it.  A candidate
    # is counted at most once per transaction, so the count / len(D) ratio
    # below is a true support value.
    ssCnt = {}
    for tid in D:
        for can in Ck:
            # frozenset.issubset accepts any iterable transaction.
            if can.issubset(tid):
                # dict.get avoids the original `in ssCnt.keys()` double lookup.
                ssCnt[can] = ssCnt.get(can, 0) + 1
    numItems = float(len(D))
    Lk = []
    supportData = {}
    for key in ssCnt:
        # Support = fraction of transactions containing the candidate.
        support = ssCnt[key] / numItems
        if support >= minSupport:
            Lk.append(key)
        supportData[key] = support
    return Lk, supportData
# Merge frequent (k-1)-itemsets into candidate k-itemsets, e.g.
# [frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})] with k=2
# -> [frozenset({1, 3}), frozenset({1, 2}), frozenset({1, 5}), ...]
def aprioriGen(Lk_1, k):
    """Generate candidate k-itemsets from the frequent (k-1)-itemsets.

    Two (k-1)-itemsets are joined when their first k-2 items agree, which
    produces each union of exactly k items once.  For k=2 the compared
    prefix is empty, so every pair of 1-itemsets is merged.
    """
    candidates = []
    for i, first in enumerate(Lk_1):
        prefix_a = sorted(list(first)[:k - 2])
        for second in Lk_1[i + 1:]:
            prefix_b = sorted(list(second)[:k - 2])
            # Matching prefixes: the union is a k-element candidate.
            if prefix_a == prefix_b:
                candidates.append(first | second)
    return candidates
# Run the first implementation on the sample data with a 0.2 support floor.
dataset = loadDataSet()
L, supportData = apriori(dataset, minSupport=0.2)

ck: [frozenset({1, 2}), frozenset({1, 5}), frozenset({1, 4}), frozenset({1, 3}), frozenset({2, 5}), frozenset({2, 4}), frozenset({2, 3}), frozenset({4, 5}), frozenset({3, 5}), frozenset({3, 4})]
lk: [frozenset({1, 2}), frozenset({1, 5}), frozenset({2, 5}), frozenset({2, 4}), frozenset({2, 3}), frozenset({1, 3})]
ck: [frozenset({1, 2, 5}), frozenset({1, 2, 3}), frozenset({1, 3, 5}), frozenset({2, 4, 5}), frozenset({2, 3, 5}), frozenset({2, 3, 4})]
lk: [frozenset({1, 2, 5}), frozenset({1, 2, 3})]
ck: [frozenset({1, 2, 3, 5})]
lk: []

Extension 1:

L

[[frozenset({1}), frozenset({2}), frozenset({5}), frozenset({4}), frozenset({3})],
 [frozenset({1, 2}), frozenset({1, 5}), frozenset({2, 5}), frozenset({2, 4}), frozenset({2, 3}), frozenset({1, 3})],
 [frozenset({1, 2, 5}), frozenset({1, 2, 3})],
 []]

supportData

{frozenset({1}): 0.6666666666666666,
 frozenset({2}): 0.7777777777777778,
 frozenset({5}): 0.2222222222222222,
 frozenset({4}): 0.2222222222222222,
 frozenset({3}): 0.6666666666666666,
 frozenset({1, 2}): 0.4444444444444444,
 frozenset({1, 5}): 0.2222222222222222,
 frozenset({2, 5}): 0.2222222222222222,
 frozenset({2, 4}): 0.2222222222222222,
 frozenset({2, 3}): 0.4444444444444444,
 frozenset({1, 4}): 0.1111111111111111,
 frozenset({1, 3}): 0.4444444444444444,
 frozenset({3, 5}): 0.1111111111111111,
 frozenset({1, 2, 5}): 0.2222222222222222,
 frozenset({1, 2, 3}): 0.2222222222222222,
 frozenset({1, 3, 5}): 0.1111111111111111,
 frozenset({2, 3, 5}): 0.1111111111111111,
 frozenset({1, 2, 3, 5}): 0.1111111111111111}
from numpy import *
# Build the (second) sample data set.
def loadDataSet():
    """Return the four sample transactions used by the second example."""
    return [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
# Convert every distinct item into a frozenset and collect them in a list.
def createC1(dataSet):
    """Return the sorted list of candidate 1-itemsets as frozensets.

    frozenset (immutable set) is used instead of set so the itemsets can
    later serve as dictionary keys when counting support.

    :param dataSet: list of transactions (each a list of items)
    :return: sorted list of single-item frozensets
    """
    # A set de-duplicates in O(1) per item, replacing the original
    # O(n^2) `not [item] in C1` list-membership scan.
    items = set()
    for transaction in dataSet:
        items.update(transaction)
    return [frozenset([item]) for item in sorted(items)]
# Filter out candidate itemsets that do not reach the support threshold.
def scanD(D, Ck, minSupport):
    """Count candidate support over D and keep the frequent itemsets.

    :param D: list of set-like transactions
    :param Ck: candidate itemsets (frozensets, usable as dict keys)
    :param minSupport: minimum support threshold
    :return: (retList, supportData) — retList holds the frequent itemsets,
             newest first (the original insert(0) ordering is preserved);
             supportData maps every candidate that occurred at least once
             to its support (count / len(D)).
    """
    ssCnt = {}
    for tid in D:
        for can in Ck:
            # issubset: the whole candidate is contained in this transaction.
            if can.issubset(tid):
                # dict.get replaces the original in/else double lookup.
                ssCnt[can] = ssCnt.get(can, 0) + 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            # Prepend, matching the original output ordering.
            retList.insert(0, key)
        supportData[key] = support
    return retList, supportData
# Generate every candidate k-itemset that can be merged from Lk, e.g.
# [frozenset({2, 3}), frozenset({3, 5})] -> [frozenset({2, 3, 5})]
def aprioriGen(Lk, k):
    """Join frequent (k-1)-itemsets whose first k-2 items match.

    Equal prefixes guarantee the union of the two sets has exactly k
    elements, so each candidate is emitted once.
    """
    merged = []
    n = len(Lk)
    for i in range(n):
        for j in range(i + 1, n):
            # Compare the sorted first k-2 items of each itemset.
            prefix_i = sorted(list(Lk[i])[:k - 2])
            prefix_j = sorted(list(Lk[j])[:k - 2])
            if prefix_i == prefix_j:
                merged.append(Lk[i] | Lk[j])
    return merged
# Driver that wires the whole Apriori pipeline together.
# Returns every frequent itemset plus the support of each counted candidate.
def apriori(dataSet, minSupport = 0.5):
    """Mine frequent itemsets from dataSet at the given support floor."""
    # Transactions as sets: [{1, 3, 4}, {2, 3, 5}, {1, 2, 3, 5}, {2, 5}]
    D = list(map(set, dataSet))
    # Candidate 1-itemsets: [frozenset({1}), ..., frozenset({5})]
    C1 = createC1(dataSet)
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    # Stop once a pass yields no frequent itemsets.
    while L[k - 2]:
        # Candidate k-itemsets, then filter them by support.
        candidates = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, candidates, minSupport)
        # Fold the new supports into the running dictionary.
        supportData.update(supK)
        L.append(Lk)
        # Each pass grows the itemset size by exactly one.
        k += 1
    return L, supportData
# Entry point for extracting association rules from the frequent itemsets.
def generateRules(L, supportData, minConf=0.7):
    """Return (antecedent, consequent, confidence) triples meeting minConf.

    supportData is the candidate -> support dictionary built by apriori().
    """
    rules = []
    # L[0] holds 1-itemsets, which cannot form a rule; start at L[1].
    for size_idx in range(1, len(L)):
        for freqSet in L[size_idx]:
            # Split the itemset into single-element consequent candidates,
            # e.g. frozenset({2, 3}) -> [frozenset({2}), frozenset({3})].
            singles = [frozenset([item]) for item in freqSet]
            if size_idx > 1:
                # 3+ items: recursively build and test larger consequents.
                rulesFromConseq(freqSet, singles, supportData, rules, minConf)
            else:
                # Exactly 2 items: test the single-item consequents directly.
                calcConf(freqSet, singles, supportData, rules, minConf)
    return rules
# Evaluate candidate rules; keep those meeting the minimum confidence.
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Test each consequent in H against freqSet; return those that pass.

    Passing rules are printed and appended to brl as
    (antecedent, consequent, confidence) triples.
    """
    prunedH = []
    for consequent in H:
        antecedent = freqSet - consequent
        # confidence(A -> C) = support(A | C) / support(A)
        confidence = supportData[freqSet] / supportData[antecedent]
        if confidence >= minConf:
            print(antecedent, '-->', consequent, 'conf:', confidence)
            brl.append((antecedent, consequent, confidence))
            prunedH.append(consequent)
    return prunedH
# Recursively generate and test candidate consequents for 3+ itemsets.
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Evaluate the size-m consequents in H, then grow them to size m+1.

    Bug fix: the original merged the size-m consequents into size-(m+1)
    ones without ever passing H itself to calcConf, so for 3-itemsets the
    single-item-consequent rules (e.g. {2, 3} --> {5}) were never
    evaluated.  We now test H first and only merge the survivors.
    """
    m = len(H[0])
    # The consequent must stay a proper subset of freqSet.
    if len(freqSet) > m:
        # Test the current consequents first (this call was missing).
        Hmp1 = calcConf(freqSet, H, supportData, brl, minConf)
        # Need at least two surviving consequents to merge, and room for
        # a consequent one element larger.
        if len(Hmp1) > 1 and len(freqSet) > (m + 1):
            Hmp1 = aprioriGen(Hmp1, m + 1)
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
# Run the second implementation end-to-end: mine the frequent itemsets,
# then derive the association rules at 70% confidence.
dataSet = loadDataSet()
L,suppData = apriori(dataSet,minSupport=0.5)
rules = generateRules(L,suppData,minConf=0.7)
# rules = generateRules(L,suppData,minConf=0.5)
print(" The itemsets with strong association rules are :",rules)

Extension 2 :

https://github.com/apachecn/AiLearning/blob/master/src/py2.x/ml/11.Apriori/apriori.py

  1. 上一篇文章:
  2. 下一篇文章:
Copyright © 程式師世界 All Rights Reserved