diff options
Diffstat (limited to 'logreg_train.py')
| -rw-r--r-- | logreg_train.py | 86 |
1 file changed, 86 insertions, 0 deletions
"""Train one-vs-all logistic-regression models (one per Hogwarts house) on a
dataset CSV and save the learned weights to ``weights.csv``.

Usage: python logreg_train.py <dataset.csv> <learning rate> <iterations>
"""
from module.constants import LOGISTIC_REGRESSION_FEATURE_TITLES, HOUSE_FEATURE_CSV_TITLE
from module.dataset_manip import parse_csv, impute_mean, impute_median, standardize
from module.math import gradient_descent
import sys

# One-vs-all hypothesis labels: one binary classifier is trained per house.
HOUSE_NAMES = ["Gryffindor", "Slytherin", "Hufflepuff", "Ravenclaw"]


def _load_feature_matrix(filename):
    """Parse the training features from *filename*, standardize and impute
    them, and return a row-major matrix (one row per training example)."""
    data_dict = parse_csv(filename, LOGISTIC_REGRESSION_FEATURE_TITLES)
    # NOTE(review): standardization runs BEFORE imputation, so the value
    # imputed below is the mean of the standardized column (~0). Confirm this
    # ordering is intentional; imputing first would weight real samples only.
    for key in data_dict:
        data_dict[key] = impute_mean(standardize(data_dict[key]))
    # parse_csv yields column vectors; transpose them into example rows.
    return [list(row) for row in zip(*data_dict.values())]


def _load_label_vectors(filename):
    """Return ``{house: [0/1, ...]}`` one-vs-all label vectors, one entry per
    house, each aligned with the training examples in *filename*."""
    data_dict = parse_csv(filename, [], [HOUSE_FEATURE_CSV_TITLE])
    actual_labels = data_dict[HOUSE_FEATURE_CSV_TITLE]
    return {
        house: [1 if label == house else 0 for label in actual_labels]
        for house in HOUSE_NAMES
    }


def _save_weights(save_filename, weights):
    """Write the trained weights to *save_filename* as CSV: a header line,
    then one row per house (label, one weight per feature, bias)."""
    headers = ["label", *LOGISTIC_REGRESSION_FEATURE_TITLES, "bias"]
    with open(save_filename, "w") as file:
        file.write(",".join(headers) + "\n")
        for house, house_weights in weights.items():
            file.write(",".join([house, *map(str, house_weights)]) + "\n")
        print(f"Computed weights saved to {save_filename}")


def main():
    """Parse CLI arguments, train each one-vs-all model, and save weights."""
    if len(sys.argv) < 4:
        print(f"Usage: python {__file__} <dataset.csv> <learning rate> <iterations>")
        sys.exit(-1)

    filename = sys.argv[1]
    learning_rate = float(sys.argv[2])
    iteration_count = int(sys.argv[3])

    feature_matrix = _load_feature_matrix(filename)
    actual_labels_vectors = _load_label_vectors(filename)

    weights = {}
    for house, label_vector in actual_labels_vectors.items():
        print(f"Training model for hypothesis label = {house}")
        # Allocate a FRESH zero vector per hypothesis. The original shared a
        # single weight_vector across all four trainings, so an in-place
        # gradient_descent would leak one model's weights into the next
        # model's starting point.
        weight_vector = [0] * (len(LOGISTIC_REGRESSION_FEATURE_TITLES) + 1)
        computed_weights = gradient_descent(
            feature_matrix,
            weight_vector,
            label_vector,
            learning_rate,
            iteration_count,
        )
        print("Computed weights: " + str(computed_weights))
        weights[house] = computed_weights

    _save_weights("weights.csv", weights)


if __name__ == "__main__":
    main()
