aboutsummaryrefslogtreecommitdiffstats
path: root/logreg_train.py
diff options
context:
space:
mode:
Diffstat (limited to 'logreg_train.py')
-rw-r--r--logreg_train.py86
1 file changed, 86 insertions, 0 deletions
diff --git a/logreg_train.py b/logreg_train.py
new file mode 100644
index 0000000..bf3fb4b
--- /dev/null
+++ b/logreg_train.py
@@ -0,0 +1,86 @@
+from module.constants import LOGISTIC_REGRESSION_FEATURE_TITLES, HOUSE_FEATURE_CSV_TITLE
+from module.dataset_manip import parse_csv, impute_mean, impute_median, standardize
+from module.math import gradient_descent
+import sys
+
# Validate CLI arguments up front: dataset path, learning rate, iteration count.
if len(sys.argv) < 4:
    print(f"Usage: python {__file__} <dataset.csv> <learning rate> <iterations>")
    # sys.exit is the canonical way to terminate a script; the bare exit()
    # builtin is a site-module convenience meant for interactive use.
    sys.exit(-1)
+
# Load the selected feature columns from the CSV file into a dict of
# column-name -> list of values.
filename = sys.argv[1]
data_dict = parse_csv(filename, LOGISTIC_REGRESSION_FEATURE_TITLES)

# Standardize each feature column, then fill its missing entries with the
# column mean. Each column is transformed independently, so both passes
# collapse into one comprehension.
# NOTE(review): standardizing before imputing means the imputed value is the
# mean of the *standardized* column — confirm this ordering is intentional.
data_dict = {
    key: impute_mean(standardize(values)) for key, values in data_dict.items()
}
+
# Build the (examples x features) design matrix by transposing the per-feature
# columns: row j holds example j's value for every feature, in the same
# feature order as data_dict.
# Assumes every feature column has the same number of rows — the original
# append-then-overwrite index loop relied on the same assumption.
data_list = list(data_dict.values())
feature_matrix = [list(example) for example in zip(*data_list)]
+
# One weight per feature plus a bias term, all starting at zero.
weight_vector = [0] * (len(LOGISTIC_REGRESSION_FEATURE_TITLES) + 1)
+
# Re-read the CSV, this time keeping only the house (label) column.
data_dict = parse_csv(filename, [], [HOUSE_FEATURE_CSV_TITLE])
house_column = data_dict[HOUSE_FEATURE_CSV_TITLE]

# One binary label vector per house for the one-vs-all classifiers:
# 1 where the example belongs to that house, 0 everywhere else.
actual_labels_vectors = {
    house: [1 if actual == house else 0 for actual in house_column]
    for house in ("Gryffindor", "Slytherin", "Hufflepuff", "Ravenclaw")
}
+
# Train one logistic-regression model per house (one-vs-all) with gradient
# descent, collecting the learned weights keyed by house name.
learning_rate = float(sys.argv[2])
iteration_count = int(sys.argv[3])
weights = {}
for label, label_vector in actual_labels_vectors.items():
    print(f"Training model for hypothesis label = {label}")
    computed_weights = gradient_descent(
        feature_matrix,
        # Defensive copy: if gradient_descent updates the weight vector in
        # place, sharing one list would leak weights between hypotheses.
        # NOTE(review): harmless if gradient_descent is non-mutating — confirm.
        list(weight_vector),
        label_vector,
        learning_rate,
        iteration_count,
    )
    print("Computed weights: " + str(computed_weights))
    weights[label] = computed_weights
+
# Persist the trained weights as CSV: one row per house; columns are the
# label, the per-feature weights, then the bias term.
# NOTE(review): the header order assumes gradient_descent returns the bias as
# the *last* weight — confirm against its implementation.
save_filename = "weights.csv"
with open(save_filename, "w") as file:
    headers = ["label"] + list(LOGISTIC_REGRESSION_FEATURE_TITLES) + ["bias"]
    file.write(",".join(headers) + "\n")
    for label in actual_labels_vectors:
        row = [label] + [str(weight) for weight in weights[label]]
        file.write(",".join(row) + "\n")

print(f"Computed weights saved to {save_filename}")