-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclassify_category.R
More file actions
68 lines (54 loc) · 2.78 KB
/
classify_category.R
File metadata and controls
68 lines (54 loc) · 2.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#-------------------------------------------------------------------------------------------------------------
# Classification algorithm based on Bayesian methods
#' Classify documents into complaint categories using a word lexicon.
#'
#' Builds a document-term matrix from `textColumns` via `create_matrix()`,
#' looks every term of every document up in the lexicon stored at
#' "data/CategoryDictionary.csv" (column 1 = word, column 2 = polarity,
#' column 3 = category) and accumulates one score per category
#' (roads, electricity, water, traffic).
#'
#' NOTE(review): the function also reads the free variable `some_tweets1`
#' from its enclosing environment to obtain a screen name per document; it is
#' presumably a list of status objects aligned with the rows of the
#' document-term matrix -- confirm against the caller.
#'
#' @param textColumns Text data handed unchanged to `create_matrix()`.
#' @param algorithm `"bayes"` applies log-based scoring plus a per-category
#'   prior term; any other value uses the raw `pstrong`/`pweak` weights plus a
#'   constant 0.000001 per category.
#' @param pstrong Weight of a lexicon word whose polarity is `"strongsubj"`.
#' @param pweak Weight of every other lexicon word.
#' @param prior Multiplier applied to the word weight in `"bayes"` mode.
#' @param verbose If `TRUE`, print the document index and the category scores
#'   while classifying.
#' @param ... Further arguments forwarded to `create_matrix()`.
#' @return A character matrix with one row per document and the columns
#'   ROADS, ELECTRICITY, WATER, TRAFFIC, RATIO, BEST_FIT and SCREENNAME.
#'   A document-term matrix with zero rows yields a zero-row matrix with the
#'   same columns.
classify_category <- function(textColumns, algorithm = "bayes", pstrong = 1.0,
                              pweak = 1.0, prior = 1.0, verbose = FALSE, ...) {
  # Order matters: ties in which.max() resolve to the first category listed.
  categories <- c("water", "electricity", "roads", "traffic")
  result_cols <- c("ROADS", "ELECTRICITY", "WATER", "TRAFFIC", "RATIO",
                   "BEST_FIT", "SCREENNAME")
  # NOTE(review): hard-coded ratio that marks a "miscellaneous" prediction.
  # It presumably equals the ratio produced when no lexicon word matches with
  # the shipped dictionary -- confirm before changing the dictionary.
  misc_ratio <- 0.495830354533587

  dtm <- create_matrix(textColumns, ...)
  lexicon <- read.csv(file = "data/CategoryDictionary.csv", header = FALSE)
  lex_words <- lexicon[[1]]
  lex_polarity <- as.character(lexicon[[2]])
  lex_category <- as.character(lexicon[[3]])

  # Number of lexicon entries per category, plus the lexicon size.
  counts <- list(
    roads = sum(lex_category == "roads", na.rm = TRUE),
    electricity = sum(lex_category == "electricity", na.rm = TRUE),
    water = sum(lex_category == "water", na.rm = TRUE),
    traffic = sum(lex_category == "traffic", na.rm = TRUE),
    total = nrow(lexicon)
  )

  # The term added to every category after the word scores does not depend
  # on the document, so compute it once.
  if (algorithm == "bayes") {
    base_scores <- vapply(
      categories,
      function(key) abs(log(counts[[key]] / counts[["total"]])),
      numeric(1)
    )
  } else {
    base_scores <- rep(0.000001, length(categories))
  }
  names(base_scores) <- categories

  n_docs <- nrow(dtm)
  rows <- vector("list", n_docs)

  for (i in seq_len(n_docs)) {
    if (verbose) print(paste("DOCUMENT", i))

    scores <- list(water = 0, electricity = 0, roads = 0, traffic = 0)
    words <- findFreqTerms(dtm[i, ], lowfreq = 1)

    for (word in words) {
      index <- match(word, lex_words, nomatch = 0)
      if (index == 0) next
      category <- lex_category[index]
      # Lexicon rows with a category outside the four scored ones never
      # influenced the result; skip them explicitly.
      if (!(category %in% categories)) next

      score <- if (isTRUE(lex_polarity[index] == "strongsubj")) pstrong else pweak
      if (algorithm == "bayes") {
        score <- abs(log(score * prior / counts[[category]]))
      }
      scores[[category]] <- scores[[category]] + score
    }

    for (key in categories) {
      scores[[key]] <- scores[[key]] + base_scores[[key]]
    }

    best_fit <- names(scores)[which.max(unlist(scores))]
    ratio <- abs(scores$roads / scores$electricity / scores$water / scores$traffic)
    if (isTRUE(all.equal(ratio, misc_ratio))) best_fit <- "miscellaneous"

    tweets.df <- ldply(some_tweets1[i], function(t) t$toDataFrame())
    # as.character() keeps the label (not the integer code) if the column
    # happens to be a factor.
    screenname <- as.character(tweets.df$screenName)

    rows[[i]] <- c(scores$roads, scores$electricity, scores$water,
                   scores$traffic, ratio, best_fit, screenname)

    if (verbose) {
      print(paste("ROADS:", scores$roads, "ELECTRICITY:", scores$electricity,
                  "WATER", scores$water, "TRAFFIC:", scores$traffic))
      cat("\n")
    }
  }

  # Bind once instead of growing the matrix inside the loop.
  documents <- do.call(rbind, rows)
  if (is.null(documents)) {
    documents <- matrix(character(0), nrow = 0, ncol = length(result_cols))
  }
  colnames(documents) <- result_cols
  documents
}
#-------------------------------------------------------------------------------------------------------------
# R-Complaint-Classifier