Concurrent-KNN/knn-omp.cpp at master · Desousak/Concurrent-KNN · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
 #include <fstream>
#include <sstream>
#include <iostream>
#include <map>
#include <cmath>
#include <string>
#include <vector>
#include <omp.h>

// The file to read data from
#define data_file "edited.csv"
#define test_file "test.csv"

// define threads
#define NUM_THREADS 4

// Global Variables
std::string used_columns[] = {"country", "year", "age", "sex", "gdp_for_year", "suicides/100k"};
const int column_amount = 6;

struct neighbors {
    std::string value;
    double distance;
};

// Functions
std::map<std::string, std::vector<std::string> > load_data(std::string filename);
std::vector<std::string> delimit_string(std::string line, char delimiter);
std::string knn(int k, std::map<std::string, std::vector<std::string> > table_data, std::string user_input[column_amount]);
int find_largest_index(int k, struct neighbors closest_neighbors[], double distance);

int main(){
    // Load the data
    std::map<std::string, std::vector<std::string> > table_data = load_data(data_file);
    std::map<std::string, std::vector<std::string> > test_data = load_data(test_file);

    // The amount of values tested correctly
    int correct = 0;
    // Iterate through the test values
    for(int i = 0; i < test_data.at(used_columns[0]).size(); i++){
        // Get the answer (first column)
        std::string value = test_data.at(used_columns[0])[i];

        // Get the other inputs (other columns)
        int input_col_size = sizeof(used_columns) / sizeof(used_columns[0]);
        std::string tmp_input[input_col_size - 2];
        for(int j = 1; j < input_col_size - 1; j++){
            tmp_input[j - 1] = test_data.at(used_columns[j])[i];
        }

        // Check if the result of knn is the same as the real answer
        if(value == knn(5, table_data, tmp_input)){
            correct += 1;
        }
    }

    std::cout << "ACCURACY: " << (float)correct / test_data.at(used_columns[0]).size() << " @ K = " << 5 << std::endl;


}

std::string knn(int k, std::map<std::string, std::vector<std::string> > table_data, std::string user_input[column_amount]){

    omp_set_num_threads(NUM_THREADS);

    // used to store the results per thread
    struct neighbors closest_neighbors[k];

    struct neighbors thread_neighbors[NUM_THREADS][k];

    #pragma omp parallel
    {

        int col_amount = sizeof(used_columns) / sizeof(used_columns[0]);
        int row_amount = table_data.at(used_columns[0]).size();

        int temp_thread_count = omp_get_num_threads();
        int thread_ID = omp_get_thread_num();

        // assign tasks to different threads
        for(int i = thread_ID; i < row_amount; i+= NUM_THREADS){
            double distance = 0.0f;
            for (int j = 1; j < col_amount-1; j++){
                double tmp = stod(table_data.at(used_columns[j])[i]) - stod(user_input[j - 1]);
                distance += tmp * tmp;
            }
            // Calculate the euclidean distance
            distance = sqrt(distance);

            // If the suicide per 100k value is 0, dont consider it at all
            if(stod(table_data.at(used_columns[col_amount-1])[i]) != 0){
                // Factor in the suicides/100k rate into the suicide possibility
                distance = distance / stod(table_data.at(used_columns[col_amount-1])[i]);

                // Select smallest here
                int place_index = find_largest_index(k, thread_neighbors[thread_ID], distance);
                if(place_index != -1){
                    thread_neighbors[thread_ID][place_index].value = table_data.at(used_columns[0])[i];
                    thread_neighbors[thread_ID][place_index].distance = distance;
                }
            }
        }


    // Method 1: Uses omp critical to update the closest_neighbors array.
    // #pragma omp critical
    // {
    //     for(int i = 0; i < k; i++){
    //         int place_index = find_largest_index(k, closest_neighbors, thread_neighbors[i].distance);
    //         if(place_index != -1){
    //             closest_neighbors[place_index].value =  thread_neighbors[i].value;
    //             closest_neighbors[place_index].distance = thread_neighbors[i].distance;
    //         }
    //     }
    // }
    }

    // Method 2: Instead of using omp critical, threads store results in array concurrently
    for(int i = 0; i < NUM_THREADS; i++){
        for(int j = 0; j < k; j++){
            int place_index = find_largest_index(k, closest_neighbors, thread_neighbors[i][j].distance);
            if(place_index != -1){
                closest_neighbors[place_index].value =  thread_neighbors[i][j].value;
                closest_neighbors[place_index].distance = thread_neighbors[i][j].distance;
            }
        }
    }


    // The index of the majority value
    int majority_index = -1;
    // The distance of the smallest value
    double smallest_avg_distance = -1.0f;

    // Iterate through countries
    for(int i = 0; i < k; i++){
        // Find the amount of time a country occurs
        int occurance = 0;
        double avg_distance = 0.0f;
        for(int j = 0; j < k; j++){
            if(closest_neighbors[i].value == closest_neighbors[j].value){
                occurance += 1;
                avg_distance += closest_neighbors[j].distance;
            }
        }
        // Calculate the average distance
        avg_distance = avg_distance / occurance;

        // If its the smallest average distance, select it
        if(avg_distance < smallest_avg_distance || smallest_avg_distance == -1.0f){
            smallest_avg_distance = avg_distance;
            majority_index = i;
        }
    }
    return closest_neighbors[majority_index].value;
}

// Find the value to replace in the closest neighbors array
int find_largest_index(int k, struct neighbors closest_neighbors[], double distance){
    int index = -1;
    for(int i = 0; i < k; i++){
        if(closest_neighbors[i].value == ""){
            index = i;
            break;
        } else if(distance < closest_neighbors[i].distance && (index == -1 || closest_neighbors[i].distance > closest_neighbors[index].distance)){
            index = i;
        }
    }
    return index;
}

// Places csv data into a map
std::map<std::string, std::vector<std::string> > load_data(
    std::string filename) {
    // The input stream from the file
    std::ifstream data_stream;
    // Open the file
    data_stream.open(filename);

    // The map to hold all of the data
    std::map<std::string, std::vector<std::string> > table_data;
    // The tmp string to hold the line
    std::string row;
    // Read the column row
    std::getline(data_stream, row);
    // Delimit the row
    std::vector<std::string> columns = delimit_string(row, ',');

    // Initalize the map
    for(std::string column : columns){
        // Create the vector to hold the column's data
        std::vector<std::string> column_vector;
        // Insert into the map
        table_data.insert(make_pair(column, column_vector));
    }

    // Fill the columns
    while (!data_stream.eof()) {
        std::getline(data_stream, row);

        // Prevent if the line is empty
        if(row != ""){
            // Delimit the row
            std::vector<std::string> row_data = delimit_string(row, ',');

            int index = 0;
            // Insert the data into the map
            for(std::string column : columns){
                table_data.at(column).push_back(row_data[index]);
                index++;
            }
        }
    }

    // Close the stream and return
    data_stream.close();

    return table_data;
}

// Delimites a string into a vector
std::vector<std::string> delimit_string(std::string line, char delimiter){
    // A vector to hold the data
    std::vector<std::string> delimited_data;

    // Use a stringstream to split that data
    std::stringstream line_stream(line);

    // A string to hold the final column name
    std::string delimited_val;
    // A temp char to hold the character from the string
    char tmp;

    while(line_stream >> tmp){
        delimited_val += tmp;

        // If there is a comma next
        if(line_stream.peek() == delimiter || line_stream.peek() == 13 || line_stream.peek() == -1){
            // Ignore it
            line_stream.ignore();

            // Add string to vector
            delimited_data.push_back(delimited_val);

            // Reset the column name variable
            delimited_val = "";
        } else if((int)line_stream.peek() == 32){
            // Add the space manually since it is not done
            delimited_val += " ";
        }
    }

    return delimited_data;
}