# BaseClusters.py — builds per-word features (spelling distance + VADER sentiment)
# and clusters a 50k-word English list. (GitHub page chrome and the rendered
# line-number gutter from the scrape removed; only the code below is source.)
import nltk
from nltk.corpus import stopwords
import string, sys
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from dateutil.parser import parse as date_parser
import sqlite3, json
import pandas as pd
import numpy as np
from datetime import datetime
from difflib import SequenceMatcher
from sklearn.cluster import KMeans, MiniBatchKMeans
from nltk.corpus import wordnet
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
sia = SentimentIntensityAnalyzer()
def approximate_sentiment(seq1, words_ref):
    """Estimate a compound sentiment for *seq1* when VADER scored it 0.

    Strategy, in order:
      1. Score the WordNet synonyms of seq1; if a clear majority of the
         non-zero scores lean one way, return their mean.
      2. Same for the antonyms, with the sign flipped.
      3. Fuzzy-match seq1 against the VADER lexicon keys (*words_ref*)
         and, on a strong match (ratio >= 0.85), return that lexicon
         word's own compound score.
      4. Otherwise return 0.

    Parameters
    ----------
    seq1 : str
        Out-of-lexicon word to score.
    words_ref : list of str
        VADER lexicon keys (see base_df).

    Returns
    -------
    float
        Approximated compound sentiment in [-1, 1]; 0 when no signal.
    """
    global sia
    # --- 1. synonym vote --------------------------------------------------
    # BUG FIX: the original wrapped collection in `for l in range(2):`,
    # appending every lemma twice. Duplicates never changed the majority
    # vote or the mean — they only doubled the scoring work.
    syn, ant = [], []
    for synset in wordnet.synsets(seq1):
        for lemma in synset.lemmas():
            syn.append(lemma.name())
            if lemma.antonyms():
                ant.append(lemma.antonyms()[0].name())
    scores = [s for s in (sia.polarity_scores(w)['compound'] for w in syn) if s]
    pos = [s for s in scores if s > 0]
    neg = [s for s in scores if s < 0]
    if len(pos) > len(scores) / 2:
        return np.mean(pos)
    elif len(neg) > len(scores) / 2:
        return np.mean(neg)
    # --- 2. antonym vote (sign flipped) -----------------------------------
    scores = [s for s in (sia.polarity_scores(w)['compound'] for w in ant) if s]
    pos = [s for s in scores if s > 0]
    neg = [s for s in scores if s < 0]
    if len(pos) > len(scores) / 2:
        return -1 * np.mean(pos)
    elif len(neg) > len(scores) / 2:
        return -1 * np.mean(neg)
    # --- 3. fuzzy lexicon match -------------------------------------------
    # BUG FIX: the original iterated `words_ref[:0]` (an empty slice), so
    # this whole fallback was dead code and the function always returned 0
    # from here. It also seeded the ratio list with a dummy 0 (making the
    # index into words_ref off by one), mangled candidates via `j = j[:2]`,
    # and never used the matched word's score (`seq1 = resp_; return 0`).
    if words_ref:
        ratios = [SequenceMatcher(None, seq1, j).ratio() for j in words_ref]
        best = max(ratios)
        if best >= 0.85:
            # The matched word is a VADER lexicon key, so its own compound
            # score is a grounded approximation for the near-identical seq1.
            return sia.polarity_scores(words_ref[ratios.index(best)])['compound']
    return 0
def absolute_distance(seq1):
    """Squared-deviation "spelling" feature for a word.

    Hyphens are stripped, the word is padded with 'abcdefghi' and cut to
    nine characters, each letter is mapped to its 0-based position in the
    alphabet, and the sum of squared deviations from the fixed reference
    positions 1..9 is returned.

    Returns the exception message as a *string* when the word contains a
    character outside a-z — the caller coerces that to NaN and drops the
    row — so the int-or-str return type is intentional.
    """
    word = seq1.replace("-", "")
    try:
        # Pad with the first nine alphabet letters, then keep exactly nine.
        padded = (word + string.ascii_lowercase[:9])[:9]
        positions = np.array([string.ascii_lowercase.index(ch) for ch in padded])
        reference = np.arange(1, 10)  # 1-based reference, as in the original
        return np.sum((positions - reference) ** 2)
    except Exception as err:
        return str(err)
def base_df(words_list):
    """Build the per-word feature frame used for clustering.

    For each word, computes a 'distance' spelling feature
    (absolute_distance) and a VADER 'sentiment' compound score,
    approximated via WordNet/lexicon matching when VADER returns 0.

    Parameters
    ----------
    words_list : iterable of str

    Returns
    -------
    pandas.DataFrame
        Indexed by word, columns ['distance', 'sentiment'].  'distance'
        is scaled by its maximum; rows with a non-numeric feature (e.g.
        words absolute_distance could not encode, which return an error
        string) are coerced to NaN and dropped.
    """
    global sia
    words_ref = list(sia.make_lex_dict().keys())
    distance_ = []
    vader_sentiment_ = []
    for word in words_list:
        distance_.append(absolute_distance(word))
        sent_ = sia.polarity_scores(word)['compound']
        if not sent_:
            # VADER has no entry for this word -> approximate it.
            sent_ = approximate_sentiment(word, words_ref)
        vader_sentiment_.append(sent_)
    # FIX: build the frame directly from the lists. The original routed the
    # data through np.array([...]).transpose(), which cast every number to a
    # string (mixed-type ndarray collapses to a '<U' dtype) only for
    # pd.to_numeric to parse it straight back.
    df = pd.DataFrame(
        {'distance': distance_, 'sentiment': vader_sentiment_},
        index=list(words_list),
    )
    # Coercion still needed: absolute_distance returns an error *string*
    # for unencodable words; those become NaN and are dropped below.
    df['distance'] = pd.to_numeric(df['distance'], errors='coerce')
    df['sentiment'] = pd.to_numeric(df['sentiment'], errors='coerce')
    df['distance'] /= df['distance'].max()  # scale to [0, 1]
    return df.dropna(axis='rows')
if __name__ == "__main__":
    # Load the 50k English word list, one word per line.
    words_list = []
    with open("assets/50kenglish.txt", "r") as f:
        words_list = f.readlines()
    words_list = [i.replace("\n","") for i in words_list]
    # Feature frame: distance + sentiment per word (see base_df).
    df = base_df(words_list)
    df.to_excel("assets/output.xlsx")
    # Elbow sweep: fit MiniBatchKMeans for k = 2..1498 and record inertia.
    # NOTE(review): "distorsions" (sic) is kept — it names the log file below.
    distorsions = []
    scaler = StandardScaler()
    X_std = scaler.fit_transform(df)
    batch_ = 500
    for k in tqdm(range(2, (batch_*3)-1)):
        #kmeans = KMeans(n_clusters=k)
        # MiniBatch variant for speed; random_state pinned for repeatability.
        kmeans = MiniBatchKMeans(n_clusters=k, random_state=0, batch_size=batch_)
        kmeans.fit(X_std)
        distorsions.append(kmeans.inertia_)
    with open("distorsions.log", "w") as f:
        f.write(json.dumps(distorsions))
    # NOTE(review): this import deliberately rebinds the loop variable
    # `kmeans` (a MiniBatchKMeans instance) to scipy's kmeans function.
    from scipy.cluster.vq import kmeans
    from kneed import KneeLocator
    # Locate the elbow of the inertia curve to pick the cluster count.
    kn = KneeLocator(range(2, (batch_*3)-1), distorsions, curve='convex', direction='decreasing')
    # NOTE(review): kn.knee can be None when no knee is found, which would
    # make "%d" raise TypeError — confirm inputs always produce a knee.
    print("Elbow's optimal clusters number: %d"%kn.knee)
    # Final clustering on the (unscaled) feature frame; np.c_ gives a 2-D array.
    features = np.c_[df]
    clusters = kmeans(features,kn.knee)
    # clusters[0] holds the centroid codebook; persist it as JSON.
    with open("assets/englishClusters.log", "w") as f:
        f.write(json.dumps(clusters[0].tolist()))