diff options
| author | Thomas Vanbesien <tvanbesi@proton.me> | 2026-04-01 17:42:04 +0200 |
|---|---|---|
| committer | Thomas Vanbesien <tvanbesi@proton.me> | 2026-04-01 17:42:04 +0200 |
| commit | 32cd9b2be1763f872c800b17e1fa63f852fe91c1 (patch) | |
| tree | 8aee9bd7e81d8204faca701c0a852bcf7dc45de6 /module | |
| download | DSLR-32cd9b2be1763f872c800b17e1fa63f852fe91c1.tar.gz DSLR-32cd9b2be1763f872c800b17e1fa63f852fe91c1.zip | |
Diffstat (limited to 'module')
| -rw-r--r-- | module/constants.py | 31 | ||||
| -rw-r--r-- | module/dataset_manip.py | 108 | ||||
| -rw-r--r-- | module/math.py | 234 |
3 files changed, 373 insertions, 0 deletions
# =============================================================================
# module/constants.py (new file)
# =============================================================================

# CSV column title of the house feature
HOUSE_FEATURE_CSV_TITLE = "Hogwarts House"

# CSV column titles of the numerical features
NUMERICAL_FEATURE_CSV_TITLES = [
    "Arithmancy",
    "Astronomy",
    "Herbology",
    "Defense Against the Dark Arts",
    "Divination",
    "Muggle Studies",
    "Ancient Runes",
    "History of Magic",
    "Transfiguration",
    "Potions",
    "Care of Magical Creatures",
    "Charms",
    "Flying",
]

# Color name associated with each (lowercase) house name
HOUSE_COLORS = {
    "gryffindor": "red",
    "slytherin": "green",
    "ravenclaw": "blue",
    "hufflepuff": "yellow",
}

# List of features selected to perform the logistic regression
LOGISTIC_REGRESSION_FEATURE_TITLES = [
    "Astronomy",
    "Defense Against the Dark Arts",
    "Charms",
]


# =============================================================================
# module/dataset_manip.py (new file)
# =============================================================================

import csv
from math import isnan

# NOTE: as a standalone file, module/dataset_manip.py also needs
#     from module.math import get_mean, get_quartiles, get_std
# In this single listing those helpers are defined in the module/math.py
# section below and are looked up when the functions are called.


def _is_missing(n):
    """Check if a value is None or NaN."""
    return n is None or (isinstance(n, float) and isnan(n))


def _to_float_or_none(value):
    """Convert a value to float, or return None when it is not a number."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def read_csv(filename):
    """
    Reads a CSV file and returns a dictionary where each key is a column header, and the
    corresponding value is a list of the (string) values from that column.

    ```
    {
        "header_1": ['val1', 'val2', 'val3', ...],
        "header_2": ['val1', 'val2', 'val3', ...],
        ...
    }
    ```

    Every column of the result has one entry per non-blank data line: blank lines are skipped,
    lines with fewer fields than headers are padded with empty strings, and fields beyond the
    last header are ignored. Quoted fields (which may contain commas) are handled by the
    standard `csv` module.

    Parameters:
        filename (str): The name of the CSV file to read.

    Returns:
        dict: The columns of the file, or None if the file does not exist (a message is
        printed in that case).
    """
    try:
        with open(filename) as file:
            # Lines are stripped before parsing so that surrounding whitespace never ends up
            # in the first or last field of a row
            rows = csv.reader(line.strip() for line in file)
            # First line is the CSV column headers
            column_headers = next(rows, [])
            result = {header: [] for header in column_headers}

            for fields in rows:
                if not fields:
                    continue  # Blank line
                # Pad short rows so that all columns stay aligned row by row
                padding = [""] * (len(column_headers) - len(fields))
                for header, value in zip(column_headers, fields + padding):
                    result[header].append(value)

            return result

    except FileNotFoundError:
        print(f"{filename} not found")
        return None


def parse_csv(filename, numerical_features, extra_features=None):
    """
    Reads and parses a CSV file and returns a dictionary where each key is a column header, and the
    corresponding value is a list of values from that CSV column.

    Columns are filtered to only include those whose header is in the numerical_features or
    extra_features lists. Numerical features are converted to floats. They are set to None if
    conversion is not possible.

    Parameters:
        filename (str): The name of the CSV file to parse.

        numerical_features (list): A list of strings corresponding to the titles of the
        numerical features of the CSV file. These will be converted to float or None.

        extra_features (list): A list of strings corresponding to the titles of extra features
        to keep in the returned data. Defaults to no extra feature.

    Returns:
        dict: A dictionary where each key is a CSV column title and the corresponding value is a
        list of values from that column:
        ```
        {
            "header_1": ['val1', 'val2', 'val3', ...],
            "header_2": ['val1', 'val2', 'val3', ...],
            ...
        }
        ```

    Raises:
        FileNotFoundError: If the file does not exist.
        KeyError: If one of the numerical features is not a column of the file.
    """
    data = read_csv(filename)
    if data is None:
        # read_csv already reported the problem; fail here with a clear error instead of an
        # AttributeError on None
        raise FileNotFoundError(f"{filename} not found")

    numerical_features = list(numerical_features)
    selected_titles = set(numerical_features)
    if extra_features is not None:
        selected_titles.update(extra_features)

    # Filter unselected features
    data = {k: v for k, v in data.items() if k in selected_titles}
    # Convert numerical values from CSV (str) into numbers (float), missing values become None
    for key in numerical_features:
        data[key] = [_to_float_or_none(value) for value in data[key]]

    return data


def impute_mean(l):
    """
    Returns a list of float for a list of float|None, replacing None/NaN values with the mean
    of the other values.

    Raises:
        ZeroDivisionError: If the list contains no value that is not missing.
    """
    # Get a list without the missing values in order to calculate the mean
    l_cleaned = [n for n in l if not _is_missing(n)]
    mean = get_mean(l_cleaned)
    return [mean if _is_missing(n) else n for n in l]


def impute_median(l):
    """
    Returns a list of float for a list of float|None, replacing None/NaN values with the median
    of the other values.

    Raises:
        IndexError: If the list contains no value that is not missing.
    """
    # Get a list without the missing values in order to calculate the median
    l_cleaned = [n for n in l if not _is_missing(n)]
    median = get_quartiles(l_cleaned)[1]
    return [median if _is_missing(n) else n for n in l]


def standardize(l):
    """
    Returns a list of float|None, standardizing the values.

    Each value that is not missing is replaced by `(value - mean) / std`, where the mean and the
    standard deviation are computed on the values that are not missing. Missing values (None or
    NaN) become None.

    If the standard deviation is exactly 0 (all values are equal), every value that is not
    missing is mapped to 0.0 instead of dividing by zero.
    """
    # Get a list without the missing values in order to calculate the mean and the deviation
    l_cleaned = [n for n in l if not _is_missing(n)]
    if not l_cleaned:
        return [None for _ in l]

    mean = get_mean(l_cleaned)
    std = get_std(l_cleaned)
    if std == 0:
        return [None if _is_missing(n) else 0.0 for n in l]
    return [None if _is_missing(n) else (n - mean) / std for n in l]


# =============================================================================
# module/math.py (new file)
# =============================================================================

from math import exp, log, sqrt, floor, ceil

# Predictions are kept this far away from 0 and 1 before taking their logarithm
_LOG_EPSILON = 1e-15


def get_mean(l):
    """
    Returns the mean of a list of numbers.

    Raises:
        ZeroDivisionError: If the list is empty.
    """
    return sum(l) / len(l)


def get_min(l):
    """
    Returns the lowest element of a list of numbers.

    Raises:
        IndexError: If the list is empty.
    """
    lowest = l[0]
    for n in l:
        if n < lowest:
            lowest = n
    return lowest


def get_max(l):
    """
    Returns the biggest element of a list of numbers.

    Raises:
        IndexError: If the list is empty.
    """
    highest = l[0]
    for n in l:
        if n > highest:
            highest = n
    return highest


def get_variance(l):
    """Returns the (population) variance of a list of numbers."""
    mean = get_mean(l)
    return get_mean([(n - mean) ** 2 for n in l])


def get_std(l):
    """Returns the (population) standard deviation of a list of numbers."""
    return sqrt(get_variance(l))


def get_quartiles(l):
    """
    Returns a tuple of the three quartiles of a list of numbers.

    Each quartile is located at position `(len(l) - 1) * p` of the sorted list, with p equal to
    0.25, 0.5 and 0.75. When that position falls between two elements, the result is linearly
    interpolated between them.

    Raises:
        IndexError: If the list is empty.
    """
    if len(l) == 0:
        raise IndexError("cannot compute the quartiles of an empty list")

    sorted_l = sorted(l)
    last_index = len(sorted_l) - 1
    quartiles = []
    for fraction in (0.25, 0.5, 0.75):
        position = last_index * fraction
        lower = sorted_l[floor(position)]
        upper = sorted_l[ceil(position)]
        # Weight of the upper neighbour is the fractional part of the position
        quartiles.append(lower + (upper - lower) * (position - floor(position)))
    return tuple(quartiles)


def get_sigmoid(x):
    """Returns the sigmoid of a number."""
    if x >= 0:
        return 1 / (1 + exp(-x))
    # For negative numbers exp(-x) can overflow, so use the equivalent form based on exp(x),
    # which can only underflow to 0
    e = exp(x)
    return e / (1 + e)


def get_linear_combination(variables, coefficients):
    """
    Returns the linear combination of a set of variables and a set of coefficients.

    If the two lists do not have the same length, the extra elements of the longest one are
    ignored.

    Parameters:
        variables (list): A list of variables
        coefficients (list): A list of coefficients
    Returns:
        float: The sum of each variable weighted by each coefficient
    """
    return sum((v * c for v, c in zip(variables, coefficients)), 0.0)


def get_hypothesis(feature_vector, weight_vector):
    """
    Returns the probability that the output is true given a feature vector and its associated
    weight vector.

    Parameters:
        feature_vector (list): A list of numbers representing the features

        weight_vector (list): A list of numbers of size `len(feature_vector) + 1` whose last
        element is the bias term

    Returns:
        float: The probability that the output is true (between 0 and 1)
    """
    h = get_linear_combination(feature_vector, weight_vector[:-1])
    h += weight_vector[-1]  # Bias term
    return get_sigmoid(h)


def get_cost(feature_vector, weight_vector, actual_label):
    """
    Returns the logistic regression cost for a training example.

    Parameters:
        feature_vector (list): A list of numbers representing the features of one training
        example.

        weight_vector (list): A list of numbers of size `len(feature_vector) + 1` whose last
        element is the bias term. Represents the weight of each feature for the hypothesis.

        actual_label (int): The actual label of the training example (1 if true or 0 if false).

    Returns:
        float: The log-loss of the prediction for a training example.
    """
    h = get_hypothesis(feature_vector, weight_vector)
    # The sigmoid saturates to exactly 0.0 or 1.0 for large inputs, and log(0) is undefined:
    # keep the prediction strictly between 0 and 1
    h = min(max(h, _LOG_EPSILON), 1 - _LOG_EPSILON)
    return -actual_label * log(h) - (1 - actual_label) * log(1 - h)


def get_total_cost(feature_matrix, weight_vector, actual_labels):
    """
    Returns the logistic regression cost for a set of training examples.

    Parameters:
        feature_matrix (list): A list of list of numbers representing the features for each
        training example.

        weight_vector (list): A list of numbers with one more element than each list in
        `feature_matrix`, whose last element is the bias term. Represents the weight of each
        feature for the hypothesis.

        actual_labels (list): A list of numbers representing the actual labels (1 if true or 0
        if false), one for each training example.

    Returns:
        float: The mean log-loss of the predictions across all training examples.

    Raises:
        ZeroDivisionError: If `feature_matrix` is empty.
    """
    total_cost = 0.0
    for feature_vector, actual_label in zip(feature_matrix, actual_labels):
        total_cost += get_cost(feature_vector, weight_vector, actual_label)
    training_example_count = len(feature_matrix)
    return (1 / training_example_count) * total_cost


def get_partial_derivative(hypotheses, actual_labels, column_vector):
    """
    Returns the partial derivative of the cost function for a given parameter.

    Parameters:
        hypotheses (list): A list of numbers representing the vector of predictions for all
        training examples.

        actual_labels (list): A list of numbers representing the actual labels (1 if true or 0
        if false), one for each training example.

        column_vector (list): A list of numbers representing the value of the parameter
        (feature) for which the partial derivative is to be computed, one for each training
        example.

    Returns:
        float: The partial derivative of the cost function with respect to the weight
        associated with the given parameter.

    Raises:
        ZeroDivisionError: If `hypotheses` is empty.
    """
    result = 0.0
    for h, label, feature in zip(hypotheses, actual_labels, column_vector):
        result += (h - label) * feature
    training_example_count = len(hypotheses)
    return (1 / training_example_count) * result


def gradient_descent(
    feature_matrix, weight_vector, actual_label_vector, learning_rate, iteration_count
):
    """
    Apply the gradient descent algorithm to compute the weight vector that minimizes the cost
    function.

    Additionally this function prints the cost update during the descent's course (about ten
    times over the whole run).

    Parameters:
        feature_matrix (list): A non-empty list of list of numbers representing the features
        for each training example.

        weight_vector (list): A list of numbers with one more element than each list in
        `feature_matrix`, whose last element is the bias term. Represents the weight of each
        feature for the hypothesis. It is not modified.

        actual_label_vector (list): A list of numbers representing the actual label vector, one
        for each training example.

        learning_rate (float): The learning rate of the algorithm.

        iteration_count (int): The count of gradient descent updates the algorithm will perform.

    Returns:
        (list): The updated weight vector.

    Raises:
        IndexError: If `feature_matrix` is empty.
    """
    # Copy the weight vector in order not to modify the original
    updated_weight_vector = weight_vector[:]

    # Get the column vectors of each feature
    feature_count = len(feature_matrix[0])
    training_example_count = len(feature_matrix)
    column_vectors = [
        [feature_vector[i] for feature_vector in feature_matrix]
        for i in range(feature_count)
    ]
    # Add a column of features equal to 1 for the bias term partial derivative computation.
    # It needs one entry per training example, like every other column.
    column_vectors.append([1] * training_example_count)

    # Save initial cost for printing and comparing during the algorithm's course
    old_cost = get_total_cost(feature_matrix, weight_vector, actual_label_vector)

    # Number of iterations between two cost reports (integer, at least 1)
    report_interval = max(1, iteration_count // 10)

    for i in range(iteration_count):
        # Get the predictions for every training example
        hypotheses = [
            get_hypothesis(feature_vector, updated_weight_vector)
            for feature_vector in feature_matrix
        ]
        # Get the partial derivative of each feature
        gradients = [
            get_partial_derivative(hypotheses, actual_label_vector, column_vector)
            for column_vector in column_vectors
        ]
        # Update the weights
        updated_weight_vector = [
            w - learning_rate * g for w, g in zip(updated_weight_vector, gradients)
        ]
        # Print cost update periodically
        if (i + 1) % report_interval == 0:
            new_cost = get_total_cost(
                feature_matrix, updated_weight_vector, actual_label_vector
            )
            print(
                f"Iteration: {i + 1:>10}, cost: {old_cost:>8.4f} -> {new_cost:>8.4f}, diff: {abs(new_cost - old_cost):>8.6f}"
            )
            old_cost = new_cost

    return updated_weight_vector
