Concurrent-KNN/knn-pt.cpp at master · Desousak/Concurrent-KNN · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#include <fstream>
#include <sstream>
#include <iostream>
#include <map>
#include <cmath>
#include <string>
#include <vector>
#include <pthread.h>

// How many neighbors to consider
#define K 5

// The file to read data from
#define data_file "edited.csv"
#define test_file "test.csv"

// The # of threads used
#define MAX_THREADS 8

// The columns to take into account and its size
const int column_amount = 6;
std::string used_columns[] = {"country", "year", "age", "sex", "gdp_for_year", "suicides/100k"};

// All of the data extracted from a csv
std::map<std::string, std::vector<std::string> > table_data;

// A struct to define a neighbor
struct neighbors {
    std::string group; // The classification of a value
    double distance; // The distance from the value to the input
};

// A struct containing all of the data needed for a thread
struct thread_data {
    int thread_id; // The ID of a thread
    std::string user_input[column_amount]; // The input passed
    struct neighbors closest_neighbors[K]; // A place to store the neighbors
};

// Define functions used
void* find_closest(void* data);
std::string knn(std::string user_input[column_amount]);
int find_largest_index(struct neighbors closest_neighbors[], double distance);
std::map<std::string, std::vector<std::string> > load_data(std::string filename);
std::vector<std::string> delimit_string(std::string line, char delimiter);

int main(){
    // Load the data
    table_data = load_data(data_file);
    std::map<std::string, std::vector<std::string> > test_data = load_data(test_file);

    // The amount of values tested correctly
    int correct = 0;

    // Iterate through the test values
    for(int i = 0; i < test_data.at(used_columns[0]).size(); i++){
        // Get the answer (first column)
        std::string value = test_data.at(used_columns[0])[i];

        // Get the other inputs (other columns)
        int input_col_size = sizeof(used_columns) / sizeof(used_columns[0]);
        std::string tmp_input[input_col_size - 2];
        for(int j = 1; j < input_col_size - 1; j++){
            tmp_input[j - 1] = test_data.at(used_columns[j])[i];
        }

        // Check if the result of knn is the same as the real answer
        if(value == knn(tmp_input)){
            correct += 1;
        }
    }

    std::cout << "ACCURACY: " << (float)correct / test_data.at(used_columns[0]).size() << " @ K = " << K << std::endl;
}

std::string knn(std::string user_input[column_amount]){
    // Define the array to hold the threads
    pthread_t threads[MAX_THREADS];
    // Define the attribute for the threads (mainly joinable)
    pthread_attr_t thread_attr;
    // Define the array to hold the variables for each thread
    struct thread_data data[MAX_THREADS];

    // Initalize the attribute and set it to JOINABLE
    pthread_attr_init(&thread_attr);
    pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);

    for(int thread_id = 0; thread_id < MAX_THREADS; thread_id++){
        // Set the thread id
        data[thread_id].thread_id = thread_id;
        // Copy the user's data
        for(int i = 0; i < column_amount - 2; i++){
            data[thread_id].user_input[i] = user_input[i];
        }

        // Run the thread
        pthread_create(&threads[thread_id], &thread_attr, find_closest, (void*)&data[thread_id]);
    }

    // Remove the attribute to free memory
    pthread_attr_destroy(&thread_attr);
    // Define the final K closest neighbors
    struct neighbors final_closest_neighbors[K];

    for (int thread_id = 0; thread_id < MAX_THREADS; thread_id++) {
        // Join all threads back
        pthread_join(threads[thread_id], NULL);

        // Determine what neighbors to keep
        for(int i = 0; i < K; i++){
            // Select smallest here
            int place_index = find_largest_index(final_closest_neighbors, data[thread_id].closest_neighbors[i].distance);

            // Swap neighbors
            if(place_index != -1){
                final_closest_neighbors[place_index].group = data[thread_id].closest_neighbors[i].group;
                final_closest_neighbors[place_index].distance = data[thread_id].closest_neighbors[i].distance;
            }
        }
    }

    // The index of the majority value
    int majority_index = -1;
    // The distance of the smallest value
    double smallest_avg_distance = -1.0f;

    // TODO: Write a better search
    // Iterate through countries
    for(int i = 0; i < K; i++){
        // Find the amount of time a country occurs
        int occurance = 0;
        double avg_distance = 0.0f;
        for(int j = 0; j < K; j++){
            if(final_closest_neighbors[i].group == final_closest_neighbors[j].group){
                occurance += 1;
                avg_distance += final_closest_neighbors[j].distance;
            }
        }
        // Calculate the average distance
        avg_distance = avg_distance / occurance;

        // If its the smallest average distance, select it
        if(avg_distance < smallest_avg_distance || smallest_avg_distance == -1.0f){
            smallest_avg_distance = avg_distance;
            majority_index = i;
        }
    }

    // Print the results - DEBUG USE
    // for(int i = 0; i < K; i++){
    //     std::cout << final_closest_neighbors[i].group << " " << final_closest_neighbors[i].distance << std::endl;
    // }
    return final_closest_neighbors[majority_index].group;
}

void* find_closest(void* data){
    // Get the thread's data
    struct thread_data* parameters = (struct thread_data*) data;

    // Get the input data's dimensions
    int col_amount = sizeof(used_columns) / sizeof(used_columns[0]);
    int row_amount = table_data.at(used_columns[0]).size();

    // Iterate through the data
    for(int i = parameters->thread_id; i < row_amount; i+=MAX_THREADS){
        double distance = 0.0f;

        // Dont use the last column, used later
        for (int j = 1; j < col_amount - 1; j++){
            double tmp = stod(table_data.at(used_columns[j])[i]) - stod(parameters->user_input[j - 1]);
            distance += tmp * tmp;
        }
        // Calculate the euclidean distance
        distance = sqrt(distance);

        // If the suicide per 100k value (last column) is 0, dont consider it at all
        if(stod(table_data.at(used_columns[col_amount-1])[i]) != 0){
             // Factor in the suicides/100k rate into the suicide possibility
            distance = distance / stod(table_data.at(used_columns[col_amount-1])[i]);

            // Find the value to swap in the array
            int place_index = find_largest_index(parameters->closest_neighbors, distance);

            if(place_index != -1){
                parameters->closest_neighbors[place_index].group = table_data.at(used_columns[0])[i];
                parameters->closest_neighbors[place_index].distance = distance;
            }
        }
    }
}

// Find the value to replace in the closest neighbors array
int find_largest_index(struct neighbors closest_neighbors[], double distance){
    int index = -1;
    for(int i = 0; i < K; i++){
        if(closest_neighbors[i].group == ""){
            index = i;
            break;
        } else if(distance < closest_neighbors[i].distance && (index == -1 || closest_neighbors[i].distance > closest_neighbors[index].distance)){
            index = i;
        }
    }
    return index;
}

// Places csv data into a map
std::map<std::string, std::vector<std::string> > load_data(
    std::string filename) {
    // The input stream from the file
    std::ifstream data_stream;
    // Open the file
    data_stream.open(filename);

    // The map to hold all of the data
    std::map<std::string, std::vector<std::string> > table_data;
    // The tmp string to hold the line
    std::string row;
    // Read the column row
    std::getline(data_stream, row);
    // Delimit the row
    std::vector<std::string> columns = delimit_string(row, ',');

    // Initalize the map
    for(std::string column : columns){
        // Create the vector to hold the column's data
        std::vector<std::string> column_vector;
        // Insert into the map
        table_data.insert(make_pair(column, column_vector));
    }

    // Fill the columns
    while (!data_stream.eof()) {
        std::getline(data_stream, row);

        // Prevent if the line is empty
        if(row != ""){
            // Delimit the row
            std::vector<std::string> row_data = delimit_string(row, ',');

            int index = 0;
            // Insert the data into the map
            for(std::string column : columns){
                table_data.at(column).push_back(row_data[index]);
                index++;
            }
        }
    }

    // Close the stream and return
    data_stream.close();

    return table_data;
}

// Delimites a string into a vector
std::vector<std::string> delimit_string(std::string line, char delimiter){
    // A vector to hold the data
    std::vector<std::string> delimited_data;

    // Use a stringstream to split that data
    std::stringstream line_stream(line);

    // A string to hold the final column name
    std::string delimited_val;
    // A temp char to hold the character from the string
    char tmp;

    while(line_stream >> tmp){
        delimited_val += tmp;

        // If there is a comma next
        if(line_stream.peek() == delimiter || line_stream.peek() == 13 || line_stream.peek() == -1){
            // Ignore it
            line_stream.ignore();

            // Add string to vector
            delimited_data.push_back(delimited_val);

            // Reset the column name variable
            delimited_val = "";
        } else if((int)line_stream.peek() == 32){
            // Add the space manually since it is not done
            delimited_val += " ";
        }
    }

    return delimited_data;
}