################################################################################
#
# Date         Name           Description
#
# 01-Mar-2017  Anurag Dixit   Initial draft
# 19-Mar-2017  Anurag Dixit   Added API for conditional probabilities
# 20-Mar-2017  Anurag Dixit   Bug fix for parsing the data correctly
# 20-Mar-2017  Anurag Dixit   Added changes for Bayesian Model incorporation and Data
# 20-Mar-2017  Anurag Dixit   Added file read for performing queries and commented out the MPLP
# 21-Mar-2017  Pavan Joshi    Deprecated reduceDimensions function to utilize numpy functions
# 21-Mar-2017  Pavan Joshi    Added API to handle nodes in the network
# 22-Mar-2017  Anurag Dixit   Added Linear Regression code for adding CPDs for continuous variables
# 24-Mar-2017  Anurag Dixit   Changes for hybrid Bayesian network model compatible data generation
# 24-Mar-2017  Anurag Dixit   Changes for Linear Regression intercept
# 24-Mar-2017  Pavan Joshi    Added API to create a hybrid Bayesian network using libpgm
# 25-Mar-2017  Anurag Dixit   Added API for calculation of entropy and KL divergence
# 26-Mar-2017  Pavan Joshi    Added API to get independencies in the model
# 27-Mar-2017  Pavan Joshi    Added API to handle inference queries and evaluation metrics
#
################################################################################
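"""Hybrid Bayesian network over page/post activity features.

Builds the network structure and CPDs with libpgm (plus a pgmpy model used only
for independence queries), writes structure.json and nodedata.json, answers the
queries listed in query.txt, and reports entropy / KL divergence evaluation
metrics for the 'Comments' variable.
"""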
import os
import csv
import json
import numpy as np
from pgmpy.models import BayesianModel
from libpgm.hybayesiannetwork import HyBayesianNetwork
from libpgm.nodedata import NodeData
from libpgm.graphskeleton import GraphSkeleton
from libpgm.sampleaggregator import SampleAggregator
from sklearn import linear_model
from construct_graph import Ndata
from Structure import Structure
from Data import Data
import scipy.stats


class BNetwork(Ndata):

    def __init__(self, fname):
        Ndata.__init__(self, fname)
        self.fname = fname
        self.model = Structure()
        self.independencies_model = BayesianModel()
    def define_structure(self):
        print "Constructing the Hybrid Bayesian Network Model graph ... "

        # Edges of the hybrid network; each node is tagged as discrete ('d'),
        # linear Gaussian ('lg') or linear Gaussian with discrete parents ('lgandd').
        self.model.add_edge(['pageCategory', 'pagePopularity'], types=['d', 'lgandd'])
        self.model.add_edge(['pagePopularity', 'pageTalkingAbout'], types=['lgandd', 'lg'])
        self.model.add_edge(['pageTalkingAbout', 'Comments'], types=['lg', 'lgandd'])
        self.model.add_edge(['postPromotion', 'Comments'], types=['d', 'lgandd'])
        self.model.add_edge(['postLength', 'postShareCt'], types=['lg', 'lg'])
        self.model.add_edge(['postLength', 'Comments'], types=['lg', 'lgandd'])
        self.model.add_edge(['postShareCt', 'Comments'], types=['lg', 'lgandd'])
        self.model.add_edge(['baseDay', 'cc2'], types=['d', 'lgandd'])
        self.model.add_edge(['cc1', 'cc2'], types=['lg', 'lgandd'])
        self.model.add_edge(['cc2', 'cc3'], types=['lgandd', 'lg'])
        self.model.add_edge(['cc3', 'Comments'], types=['lg', 'lgandd'])
        self.model.add_edge(['pageCheckins', 'Comments'], types=['lg', 'lgandd'])
        self.model.add_edge(['postDay', 'cc4'], types=['d', 'lgandd'])
        self.model.add_edge(['cc4', 'Comments'], types=['lgandd', 'lgandd'])

        # Mirror the same edges in a pgmpy BayesianModel, used only for
        # independence queries.
        self.independencies_model.add_edge('pageCategory', 'pagePopularity')
        self.independencies_model.add_edge('pagePopularity', 'pageTalkingAbout')
        self.independencies_model.add_edge('pageTalkingAbout', 'Comments')
        self.independencies_model.add_edge('postPromotion', 'Comments')
        self.independencies_model.add_edge('postLength', 'postShareCt')
        self.independencies_model.add_edge('postLength', 'Comments')
        self.independencies_model.add_edge('postShareCt', 'Comments')
        self.independencies_model.add_edge('baseDay', 'cc2')
        self.independencies_model.add_edge('cc1', 'cc2')
        self.independencies_model.add_edge('cc2', 'cc3')
        self.independencies_model.add_edge('cc3', 'Comments')
        self.independencies_model.add_edge('pageCheckins', 'Comments')
        self.independencies_model.add_edge('postDay', 'cc4')
        self.independencies_model.add_edge('cc4', 'Comments')

        with open("structure.json", "wb") as json_file:
            json_file.write(self.model.get_structure())

        DISCRETE = "d"
        LINEARGAUSSIAN = "lg"
        LGANDDISCRETE = "lgandd"

        print "Calculating CPDs compatible to Hybrid Bayesian Network Model for Hybrid Data ... "
        dat = {"pageCategory": self.get_value("pageCategory", DISCRETE),
               "pagePopularity": self.get_value("pagePopularity", LGANDDISCRETE),
               "pageTalkingAbout": self.get_value("pageTalkingAbout", LINEARGAUSSIAN),
               "postPromotion": self.get_value("postPromotion", LINEARGAUSSIAN),
               "postLength": self.get_value("postLength", LINEARGAUSSIAN),
               "postShareCt": self.get_value("postShareCt", LINEARGAUSSIAN),
               "baseDay": self.get_value("baseDay", DISCRETE),
               "cc1": self.get_value("cc1", LINEARGAUSSIAN),
               "cc2": self.get_value("cc2", LGANDDISCRETE),
               "cc3": self.get_value("cc3", LINEARGAUSSIAN),
               "postDay": self.get_value("postDay", DISCRETE),
               "pageCheckins": self.get_value("pageCheckins", LINEARGAUSSIAN),
               "cc4": self.get_value("cc4", LGANDDISCRETE),
               "Comments": self.get_value("Comments", LINEARGAUSSIAN),
               }

        self.node_data = {"Vdata": dat}
        with open("nodedata.json", "wb") as json_file:
            json_file.write(json.dumps(self.node_data, indent=2))
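    # Rough sketch of what a single "Vdata" entry might look like for a linear
    # Gaussian ('lg') node; field names are recalled from the libpgm docs, so
    # treat this as an approximation and check libpgm's NodeData documentation:
    #   "pageTalkingAbout": {
    #       "type": "lg",
    #       "parents": ["pagePopularity"],
    #       "children": ["Comments"],
    #       "mean_base": <intercept>,
    #       "mean_scal": [<coefficients, one per parent>],
    #       "variance": <noise variance>
    #   }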
    def get_independencies(self, variables=None):
        # With no arguments, return every independence implied by the graph;
        # otherwise return the local independencies of the given variables.
        if variables is None:
            return self.independencies_model.get_independencies()
        return self.independencies_model.local_independencies(variables)
    def probability_query(self, query, evidence={}):
        # Estimate P(query | evidence) from 1000 forward samples of the network.
        prob = 1
        if len(evidence.keys()) == 0:
            samples = self.bayesian_network.randomsample(1000)
        else:
            samples = self.bayesian_network.randomsample(1000, evidence)
        aggregate = self.aggregator.aggregate(samples)
        for key in query.keys():
            valDict = aggregate[key]
            if self.model.get_vertex_type(key) != "d":
                # Continuous node: fit a normal to the sampled values and use its pdf.
                # scipy.stats.norm takes the standard deviation as its scale parameter.
                values = np.array(valDict.keys(), dtype=np.float)
                dist = scipy.stats.norm(np.mean(values), np.std(values))
                prob *= dist.pdf(query[key])
            else:
                # Discrete node: use the sampled relative frequency of the queried value.
                prob *= float(valDict[str(query[key])])
        return prob
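    # Illustrative call (the values below are made up, not taken from the dataset):
    #   bn.probability_query({"Comments": 5.0}, {"postLength": 20.0, "postShareCt": 10.0})
    # returns the estimated density/probability of Comments == 5.0 given the evidence.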
    def infer(self):
        # Read queries from query.txt and dispatch on the query type code:
        # I (all independencies), LI (local independencies),
        # CP (conditional probability) and M (marginal probability).
        with open('query.txt', 'r') as f:
            lines = f.readlines()
        for i in lines:
            lst = i.strip().split(", ")
            queryType = lst[0]
            if queryType == 'I':
                print "\n\n########## Printing all independencies ##########\n"
                print self.get_independencies()
            elif queryType == 'LI':
                var = lst[1].strip("[").strip("]").split(" & ")
                print "\n\n######### Local independencies for", var, " #########\n"
                print self.get_independencies(variables=var)
            elif queryType == 'CP':
                evidences = dict()
                query = dict()
                args = lst[1].split("=")
                query[args[0].strip()] = float(args[1].strip())
                args = lst[2].split(" -> ")
                for arg in args[1].strip("[").strip("]").split("&"):
                    evid = arg.strip().split("=")
                    evidences[evid[0].strip()] = float(evid[1].strip())
                print "\n\n########## P(", lst[1], "|", lst[2].split(" -> ")[1], ") ##########\n"
                print self.probability_query(query, evidences)
            elif queryType == 'M':
                query = dict()
                args = lst[1].split("=")
                query[args[0].strip()] = float(args[1].strip())
                print "\n\n########## P(", lst[1], ") ##########\n"
                print self.probability_query(query)
        # TODO: Add handling of multiple types of queries defined in query file
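    # For illustration only -- query.txt is not included here, but infer() expects
    # one query per line in roughly this shape (variable names from the model,
    # values made up):
    #   I
    #   LI, [postLength & Comments]
    #   CP, Comments=5.0, given -> [postLength=20.0 & postShareCt=10.0]
    #   M, Comments=5.0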
    def metric(self, a, b):
        # Make sure the input parameters are probability distributions.
        # scipy.stats.entropy(a) is the Shannon entropy of a (normalised to sum to 1);
        # scipy.stats.entropy(a, b) is the KL divergence D(a || b).
        entropy = scipy.stats.entropy(a)
        kl_divergence = scipy.stats.entropy(a, b)
        return entropy, kl_divergence
    def create_network(self):
        # Build the libpgm hybrid network from the JSON files written by define_structure().
        skeleton = GraphSkeleton()
        skeleton.load("structure.json")
        ndata = NodeData()
        ndata.load("nodedata.json")
        skeleton.toporder()
        ndata.entriestoinstances()
        self.bayesian_network = HyBayesianNetwork(skeleton, ndata)
        self.aggregator = SampleAggregator()
    def evaluation_metrics(self):
        # Compare the distribution of 'Comments' sampled from the network with
        # the original training data via entropy and KL divergence.
        result = self.bayesian_network.randomsample(1000)
        aggregate = self.aggregator.aggregate(result)['Comments']
        samples = np.array(aggregate.keys(), dtype=np.float)
        # scipy.stats.norm expects the standard deviation as its scale parameter.
        samplepdf = scipy.stats.norm(np.mean(samples), np.std(samples))
        origin_data = np.array(self.target, dtype=np.float)
        originalpdf = scipy.stats.norm(np.mean(origin_data), np.std(origin_data))
        # Evaluate both densities on a grid around the mean of the original data.
        query_domain = np.linspace(np.mean(origin_data) - np.var(origin_data),
                                   np.mean(origin_data) + np.var(origin_data), 100)
        entropy, kl_divergence = self.metric(samplepdf.pdf(query_domain), originalpdf.pdf(query_domain))
        print "\n\n########## Performance Evaluation Metrics ##########\n"
        print "Mean:", samplepdf.mean()
        print "Entropy:", entropy
        print "KL Divergence:", kl_divergence


if __name__ == "__main__":
    # Collect every CSV file in the Training/ directory as input data.
    fname = []
    dirname = "Training/"
    for files in os.listdir(dirname):
        if files.endswith(".csv"):
            fname.append(dirname + files)

    bn = BNetwork(fname)
    bn.define_structure()
    bn.create_network()
    bn.infer()
    bn.evaluation_metrics()