"""Train one-vs-all logistic regression models and save their weights.

Usage:
    python <this script> <dataset.csv> <learning_rate> <iteration_count>

The script reads the columns named by LOGISTIC_REGRESSION_FEATURE_TITLES from
the CSV file, standardizes them, imputes missing values with the mean, runs
gradient descent once per house label and writes one row of weights per label
to ``weights.csv`` in the current working directory.
"""
import sys

from module.constants import (
    HOUSE_FEATURE_CSV_TITLE,
    LOGISTIC_REGRESSION_FEATURE_TITLES,
)
from module.dataset_manip import (
    impute_mean,
    impute_median,
    parse_csv,
    standardize,
)
from module.math import gradient_descent

if len(sys.argv) < 4:
    print(
        f"Usage: python {__file__} "
        "<dataset.csv> <learning_rate> <iteration_count>"
    )
    # sys.exit rather than the site-provided exit(), which is meant for the
    # interactive shell and is missing when the interpreter runs without site.
    sys.exit(-1)

# Parse every command-line argument up front so a malformed number is reported
# before any time is spent loading and preprocessing the dataset.
filename = sys.argv[1]
learning_rate = float(sys.argv[2])
iteration_count = int(sys.argv[3])

# Get data from CSV file
data_dict = parse_csv(filename, LOGISTIC_REGRESSION_FEATURE_TITLES)

# Standardize values
# NOTE(review): standardization runs before imputation, so standardize() must
# cope with whatever parse_csv uses for missing entries - confirm.
for key in data_dict:
    data_dict[key] = standardize(data_dict[key])

# Impute missing values
for key in data_dict:
    data_dict[key] = impute_mean(data_dict[key])

# Get the feature matrix: data_list holds one column per feature, while
# feature_matrix holds one row per training example (i.e. the transpose).
data_list = list(data_dict.values())
training_example_count = len(data_list[0])
feature_matrix = [[0] * len(data_list) for _ in range(training_example_count)]
for feature_index, column in enumerate(data_list):
    # A column shorter than the first one leaves its trailing cells at 0.
    for example_index, value in enumerate(column):
        feature_matrix[example_index][feature_index] = value

# Initialize the weight vector (one weight per feature plus one extra slot,
# written out under the "bias" header below).
weight_vector = [0 for _ in range(len(LOGISTIC_REGRESSION_FEATURE_TITLES) + 1)]

# Get the actual label vectors for each one-vs-all hypothesis: 1 where the
# example belongs to the house, 0 otherwise.
label_dict = parse_csv(filename, [], [HOUSE_FEATURE_CSV_TITLE])
actual_labels_vectors = {
    selected_label: [
        1 if actual_label == selected_label else 0
        for actual_label in label_dict[HOUSE_FEATURE_CSV_TITLE]
    ]
    for selected_label in ("Gryffindor", "Slytherin", "Hufflepuff", "Ravenclaw")
}

# Perform gradient descent for each one-vs-all hypothesis
weights = {}
for parameter, actual_labels in actual_labels_vectors.items():
    print(f"Training model for hypothesis label = {parameter}")
    computed_weights = gradient_descent(
        feature_matrix,
        # Give every model its own copy of the zero vector. Whether
        # gradient_descent updates its argument in place is not visible from
        # this file; if it does, sharing one list would warm-start later models
        # and make every saved result alias the same object.
        list(weight_vector),
        actual_labels,
        learning_rate,
        iteration_count,
    )
    print("Computed weights: " + str(computed_weights))
    weights[parameter] = computed_weights

# Save computed weights to a CSV file
save_filename = "weights.csv"
with open(save_filename, "w") as file:
    # Column headers
    # NOTE(review): this layout assumes the extra (bias) weight is the LAST
    # entry of each weight vector - confirm against gradient_descent.
    headers = ["label", *LOGISTIC_REGRESSION_FEATURE_TITLES, "bias"]
    file.write(",".join(headers) + "\n")

    # Rows: the label followed by its weights, in training order.
    for label, label_weights in weights.items():
        row = ",".join(str(weight) for weight in label_weights)
        file.write(label + "," + row + "\n")

print(f"Computed weights saved to {save_filename}")