from module.constants import LOGISTIC_REGRESSION_FEATURE_TITLES, HOUSE_FEATURE_CSV_TITLE
from module.dataset_manip import parse_csv, impute_mean, impute_median, standardize
from module.math import gradient_descent
import sys

"""Train one-vs-all logistic-regression models (one per Hogwarts house) with
gradient descent and save the learned weights to ``weights.csv``.

Usage: python <script> <dataset.csv> <learning rate> <iterations>
"""

if len(sys.argv) < 4:
    print(f"Usage: python {__file__} <dataset.csv> <learning rate> <iterations>")
    sys.exit(1)  # non-zero exit for bad usage; exit(-1) would wrap to 255

# Parse all arguments up front so a malformed rate/iteration count fails fast,
# before the (potentially slow) dataset load.
filename = sys.argv[1]
learning_rate = float(sys.argv[2])
iteration_count = int(sys.argv[3])

# Get the selected feature columns from the CSV file.
data_dict = parse_csv(filename, LOGISTIC_REGRESSION_FEATURE_TITLES)

# Standardize each feature column, then impute missing values with the mean.
# NOTE(review): standardizing BEFORE imputing means missing cells receive the
# mean of the standardized column (~0) — confirm this ordering is intended,
# as impute-then-standardize is the more common pipeline.
for key in data_dict:
    data_dict[key] = impute_mean(standardize(data_dict[key]))

# Build the row-major feature matrix: parse_csv yields column-major feature
# lists, so transpose them into one row per training example.
# (Assumes all columns have equal length after imputation — TODO confirm.)
data_list = list(data_dict.values())
feature_matrix = [list(example) for example in zip(*data_list)]

# Actual label vector (1 = example belongs to the house, 0 = it does not)
# for each one-vs-all hypothesis.
actual_labels_vectors = {
    "Gryffindor": [],
    "Slytherin": [],
    "Hufflepuff": [],
    "Ravenclaw": [],
}
data_dict = parse_csv(filename, [], [HOUSE_FEATURE_CSV_TITLE])
for selected_label in actual_labels_vectors:
    actual_labels_vectors[selected_label] = [
        1 if actual_label == selected_label else 0
        for actual_label in data_dict[HOUSE_FEATURE_CSV_TITLE]
    ]

# Perform gradient descent for each one-vs-all hypothesis.
weights = {}
for parameter in actual_labels_vectors:
    print(f"Training model for hypothesis label = {parameter}")
    # Fresh zero vector per hypothesis (one weight per feature + bias):
    # reusing a single list would leak state between hypotheses if
    # gradient_descent ever updates its argument in place.
    initial_weights = [0 for _ in range(len(LOGISTIC_REGRESSION_FEATURE_TITLES) + 1)]
    computed_weights = gradient_descent(
        feature_matrix,
        initial_weights,
        actual_labels_vectors[parameter],
        learning_rate,
        iteration_count,
    )
    print("Computed weights: " + str(computed_weights))
    weights[parameter] = computed_weights

# Save computed weights to a CSV file: one header row, then one row per house.
save_filename = "weights.csv"
with open(save_filename, "w") as file:
    headers = ["label"] + list(LOGISTIC_REGRESSION_FEATURE_TITLES) + ["bias"]
    file.write(",".join(headers) + "\n")
    for label in actual_labels_vectors:
        row = [label] + [str(w) for w in weights[label]]
        file.write(",".join(row) + "\n")
print(f"Computed weights saved to {save_filename}")