aboutsummaryrefslogtreecommitdiffstats
path: root/module
diff options
context:
space:
mode:
Diffstat (limited to 'module')
-rw-r--r--module/constants.py31
-rw-r--r--module/dataset_manip.py108
-rw-r--r--module/math.py234
3 files changed, 373 insertions, 0 deletions
diff --git a/module/constants.py b/module/constants.py
new file mode 100644
index 0000000..accb4c7
--- /dev/null
+++ b/module/constants.py
@@ -0,0 +1,31 @@
# Title of the CSV column holding each student's Hogwarts house label.
HOUSE_FEATURE_CSV_TITLE = "Hogwarts House"

# Titles of every CSV column containing numerical (course score) values.
NUMERICAL_FEATURE_CSV_TITLES = [
    "Arithmancy",
    "Astronomy",
    "Herbology",
    "Defense Against the Dark Arts",
    "Divination",
    "Muggle Studies",
    "Ancient Runes",
    "History of Magic",
    "Transfiguration",
    "Potions",
    "Care of Magical Creatures",
    "Charms",
    "Flying",
]

# Plot color associated with each house (keys are lowercase house names).
HOUSE_COLORS = {
    "gryffindor": "red",
    "slytherin": "green",
    "ravenclaw": "blue",
    "hufflepuff": "yellow",
}

# List of features selected to perform the logistic regression
LOGISTIC_REGRESSION_FEATURE_TITLES = [
    "Astronomy",
    "Defense Against the Dark Arts",
    "Charms",
]
diff --git a/module/dataset_manip.py b/module/dataset_manip.py
new file mode 100644
index 0000000..baea37b
--- /dev/null
+++ b/module/dataset_manip.py
@@ -0,0 +1,108 @@
+from math import isnan
+from module.math import get_mean, get_quartiles, get_std
+
+
+def _is_missing(n):
+ """Check if a value is None or NaN."""
+ return n is None or (isinstance(n, float) and isnan(n))
+
+
def read_csv(filename):
    """
    Reads a CSV file and returns a dictionary where each key is a column header, and the
    corresponding value is a list of string values from that column.

    ```
    {
        "header_1": ['val1', 'val2', 'val3', ...],
        "header_2": ['val1', 'val2', 'val3', ...],
        ...
    }
    ```

    Prints an error and returns None when the file does not exist.
    """
    try:
        with open(filename) as file:
            # First line is the CSV column headers
            column_headers = file.readline().strip().split(",")
            result = {header: [] for header in column_headers}

            for line in file:
                fields = line.strip().split(",")
                # Add each value from the entry into the corresponding list of the result
                # dict. zip() also protects against rows with a surplus of fields, which
                # previously raised an uncaught IndexError.
                for header, field in zip(column_headers, fields):
                    result[header].append(field)

            return result

    except FileNotFoundError:
        # Bug fix: the f-string had no placeholder, so the message never named the file.
        print(f"{filename} not found")
+
+
def parse_csv(filename, numerical_features, extra_features=None):
    """
    Reads and parses a CSV file and returns a dictionary where each key is a column header,
    and the corresponding value is a list of values from that CSV column.

    Columns are filtered to only include those whose header is in the numerical_features or
    extra_features lists. Numerical features are converted to floats. They are set to None
    if conversion is not possible.

    Parameters:
        filename (str): The name of the CSV file to parse.

        numerical_features (list): A list of strings corresponding to the titles of the\
        numerical features of the CSV file. These will be converted to float or None.

        extra_features (list): A list of strings corresponding to the titles of extra\
        features to keep in the returned data. Defaults to no extra features.

    Returns:
        dict: A dictionary where each key is a CSV column title and the corresponding value
        is a list of values from that column:
        ```
        {
            "header_1": ['val1', 'val2', 'val3', ...],
            "header_2": ['val1', 'val2', 'val3', ...],
            ...
        }
        ```
    """
    # Bug fix: a mutable default argument ([]) is shared across every call of the
    # function; use the None sentinel idiom instead.
    if extra_features is None:
        extra_features = []

    data = read_csv(filename)
    # Filter unselected features (set lookup instead of list concatenation + scan)
    selected = set(numerical_features) | set(extra_features)
    data = {k: v for k, v in data.items() if k in selected}
    # Convert numerical values from CSV (str) into numbers (float)
    for key in numerical_features:
        converted_values = []
        for raw_value in data[key]:
            try:
                converted_values.append(float(raw_value))
            except ValueError:
                converted_values.append(None)  # Set missing values to None
        data[key] = converted_values

    return data
+
+
def impute_mean(l):
    """Return a list of float for a list of float|None, replacing None/NaN with the mean."""
    # Compute the mean over the present values only
    present = [value for value in l if not _is_missing(value)]
    mean = get_mean(present)
    return [mean if _is_missing(value) else value for value in l]
+
+
def impute_median(l):
    """Return a list of float for a list of float|None, replacing None/NaN with the median."""
    # Compute the median (second quartile) over the present values only
    present = [value for value in l if not _is_missing(value)]
    median = get_quartiles(present)[1]
    return [median if _is_missing(value) else value for value in l]
+
+
def standardize(l):
    """Return a list of float|None with the values standardized (missing entries stay None)."""
    # Compute the mean and standard deviation over the present values only
    present = [value for value in l if not _is_missing(value)]
    mean = get_mean(present)
    std = get_std(present)
    return [None if _is_missing(value) else (value - mean) / std for value in l]
diff --git a/module/math.py b/module/math.py
new file mode 100644
index 0000000..7259d6b
--- /dev/null
+++ b/module/math.py
@@ -0,0 +1,234 @@
+from math import exp, log, sqrt, floor, ceil
+
+
def get_mean(l):
    """Returns the arithmetic mean of a list of numbers."""
    total = 0
    for value in l:
        total += value
    return total / len(l)
+
+
def get_min(l):
    """Returns the lowest element of a list of numbers."""
    smallest = l[0]
    for value in l[1:]:
        if value < smallest:
            smallest = value
    return smallest
+
+
def get_max(l):
    """Returns the biggest element of a list of numbers."""
    largest = l[0]
    for value in l[1:]:
        if value > largest:
            largest = value
    return largest
+
+
def get_variance(l):
    """Returns the (population) variance of a list of numbers."""
    mean = get_mean(l)
    squared_deviations = [(value - mean) ** 2 for value in l]
    return get_mean(squared_deviations)
+
+
def get_std(l):
    """Returns the (population) standard deviation of a list of numbers."""
    variance = get_variance(l)
    return sqrt(variance)
+
+
def get_quartiles(l):
    """Returns a tuple of the three quartiles (Q1, Q2, Q3) of a list of numbers."""
    sorted_values = sorted(l)
    count = len(l)
    quartiles = []
    for fraction in (0.25, 0.5, 0.75):
        position = count * fraction
        if position == int(position):
            # Whole index: take the element directly
            quartiles.append(sorted_values[int(position)])
        else:
            # Fractional index: average the two neighbouring elements
            lower = sorted_values[floor(position)]
            upper = sorted_values[ceil(position)]
            quartiles.append((lower + upper) / 2)
    return tuple(quartiles)
+
+
def get_sigmoid(x):
    """
    Returns the sigmoid (logistic function) of a number.

    Uses the numerically stable two-branch formulation: the previous single
    expression 1 / (1 + exp(-x)) raised OverflowError for x below roughly -709,
    because exp(-x) exceeds the float range. For all non-overflowing inputs both
    forms compute the exact same value.
    """
    if x >= 0:
        return 1 / (1 + exp(-x))
    # For x < 0, exp(x) <= 1 and cannot overflow
    e = exp(x)
    return e / (1 + e)
+
+
def get_linear_combination(variables, coefficients):
    """
    Returns the linear combination of a set of variables and a set of coefficients.

    Parameters:
        variables (list): A list of variables
        coefficients (list): A list of coefficients
    Returns:
        float: The sum of each variable weighted by each coefficient
    """
    # The 0.0 start value keeps the float accumulation identical to a manual loop
    return sum((v * c for v, c in zip(variables, coefficients)), 0.0)
+
+
def get_hypothesis(feature_vector, weight_vector):
    """
    Returns the probability that the output is true given a feature vector and its\
    associated weight vector.

    Parameters:
        feature_vector (list): A list of numbers representing the features

        weight_vector (list): A list of numbers of size `len(feature_vector) + 1` whose\
        last element is the bias term

    Returns:
        float: The probability that the output is true (between 0 and 1)
    """
    # Split the weights from the trailing bias term, then squash through the sigmoid
    *weights, bias = weight_vector
    z = get_linear_combination(feature_vector, weights) + bias
    return get_sigmoid(z)
+
+
def get_cost(feature_vector, weight_vector, actual_label):
    """
    Returns the logistic regression cost (log-loss) for one training example.

    Parameters:
        feature_vector (list): A list of numbers representing the features of one training\
        example.

        weight_vector (list): A list of numbers of size `len(feature_vector) + 1` whose\
        last element is the bias term. Represents the weight of each feature for the\
        hypothesis.

        actual_label (int): The actual label of the training example (1 if true or 0 if\
        false).

    Returns:
        float: The log-loss of the prediction for a training example.
    """
    h = get_hypothesis(feature_vector, weight_vector)
    # Only one of the two terms is non-zero since actual_label is 0 or 1
    positive_term = actual_label * log(h)
    negative_term = (1 - actual_label) * log(1 - h)
    return -positive_term - negative_term
+
+
def get_total_cost(feature_matrix, weight_vector, actual_labels):
    """
    Returns the average logistic regression cost over a set of training examples.

    Parameters:
        feature_matrix (list): A list of list of numbers representing the features for\
        each training example.

        weight_vector (list): A list of numbers the same size as each list in\
        `feature_matrix` whose last element is the bias term. Represents the weight of\
        each feature for the hypothesis.

        actual_labels (list): A list of numbers representing the actual labels (1 if true\
        or 0 if false), one for each training example.

    Returns:
        float: The log-loss of the predictions across all training examples.
    """
    total_cost = sum(
        (
            get_cost(feature_vector, weight_vector, actual_label)
            for feature_vector, actual_label in zip(feature_matrix, actual_labels)
        ),
        0.0,
    )
    training_example_count = len(feature_matrix)
    return (1 / training_example_count) * total_cost
+
+
def get_partial_derivative(hypotheses, actual_labels, column_vector):
    """
    Returns the partial derivative of the cost function for a given parameter.

    Parameters:
        hypotheses (list): A list of numbers representing the vector of predictions for\
        all training examples.

        actual_labels (list): A list of numbers representing the actual labels (1 if true\
        or 0 if false), one for each training example.

        column_vector (list): A list of numbers representing the value of the parameter\
        (feature) for which the partial derivative is to be computed, one for each\
        training example.

    Returns:
        float: The partial derivative of the cost function with respect to the weight\
        associated with the given parameter.
    """
    # Sum of the prediction errors weighted by the feature value of each example
    weighted_errors = sum(
        (
            (prediction - label) * feature
            for prediction, label, feature in zip(hypotheses, actual_labels, column_vector)
        ),
        0.0,
    )
    training_example_count = len(hypotheses)
    return (1 / training_example_count) * weighted_errors
+
+
def gradient_descent(
    feature_matrix, weight_vector, actual_label_vector, learning_rate, iteration_count
):
    """
    Apply the gradient descent algorithm to compute the weight vector that minimizes the\
    cost function.

    Additionally this function prints the cost update during the descent's course\
    (roughly 10 progress lines over the whole run).

    Parameters:
        feature_matrix (list): A list of list of numbers representing the features for\
        each training example.

        weight_vector (list): A list of numbers the same size as each list in\
        `feature_matrix` whose last element is the bias term. Represents the weight of\
        each feature for the hypothesis.

        actual_label_vector (list): A list of numbers representing the actual label\
        vector, one for each training example.

        learning_rate (int): The learning rate of the algorithm.

        iteration_count (int): The count of gradient descent updates the algorithm will\
        perform.

    Returns:
        (list): The updated weight vector.
    """
    # Copy the weight vector in order not to modify the original
    updated_weight_vector = weight_vector[:]

    training_example_count = len(feature_matrix)
    feature_count = len(feature_matrix[0])

    # Build the column vector of each feature (i.e. transpose the feature matrix)
    column_vectors = [[] for _ in range(feature_count)]
    for feature_vector in feature_matrix:
        for i, feature in enumerate(feature_vector):
            column_vectors[i].append(feature)
    # Add a column of features equal to 1 for the bias term partial derivative
    # computation. Bug fix: this column must contain one entry per TRAINING EXAMPLE;
    # it previously had `feature_count` entries, so zip() inside
    # get_partial_derivative silently truncated the bias gradient to the first
    # `feature_count` examples while still dividing by the full example count.
    column_vectors.append([1] * training_example_count)

    # Integer print interval (the old float modulo broke for iteration_count < 10)
    print_interval = max(1, iteration_count // 10)

    # Save initial cost for printing and comparing during the algorithm's course
    old_cost = get_total_cost(feature_matrix, weight_vector, actual_label_vector)

    for i in range(iteration_count):
        # Get the predictions for every training example
        hypotheses = [
            get_hypothesis(feature_vector, updated_weight_vector)
            for feature_vector in feature_matrix
        ]
        # Get the partial derivative of each feature (the bias term is last)
        gradients = [
            get_partial_derivative(hypotheses, actual_label_vector, column_vector)
            for column_vector in column_vectors
        ]
        # Simultaneously update all the weights
        updated_weight_vector = [
            w - learning_rate * g for w, g in zip(updated_weight_vector, gradients)
        ]
        # Print cost update periodically
        if (i + 1) % print_interval == 0:
            new_cost = get_total_cost(
                feature_matrix, updated_weight_vector, actual_label_vector
            )
            print(
                f"Iteration: {i + 1:>10}, cost: {old_cost:>8.4f} -> {new_cost:>8.4f}, diff: {abs(new_cost - old_cost):>8.6f}"
            )
            old_cost = new_cost

    return updated_weight_vector