aboutsummaryrefslogtreecommitdiffstats
path: root/module
diff options
context:
space:
mode:
Diffstat (limited to 'module')
-rw-r--r--module/constants.py31
-rw-r--r--module/dataset_manip.py108
-rw-r--r--module/math.py234
3 files changed, 373 insertions, 0 deletions
diff --git a/module/constants.py b/module/constants.py
new file mode 100644
index 0000000..accb4c7
--- /dev/null
+++ b/module/constants.py
@@ -0,0 +1,31 @@
# Title of the CSV column holding each student's Hogwarts house label.
HOUSE_FEATURE_CSV_TITLE = "Hogwarts House"

# Titles of every CSV column containing numerical (course score) values.
NUMERICAL_FEATURE_CSV_TITLES = [
    "Arithmancy",
    "Astronomy",
    "Herbology",
    "Defense Against the Dark Arts",
    "Divination",
    "Muggle Studies",
    "Ancient Runes",
    "History of Magic",
    "Transfiguration",
    "Potions",
    "Care of Magical Creatures",
    "Charms",
    "Flying",
]

# Plot color associated with each house (keys are lowercase house names).
HOUSE_COLORS = {
    "gryffindor": "red",
    "slytherin": "green",
    "ravenclaw": "blue",
    "hufflepuff": "yellow",
}

# List of features selected to perform the logistic regression
LOGISTIC_REGRESSION_FEATURE_TITLES = [
    "Astronomy",
    "Defense Against the Dark Arts",
    "Charms",
]
diff --git a/module/dataset_manip.py b/module/dataset_manip.py
new file mode 100644
index 0000000..baea37b
--- /dev/null
+++ b/module/dataset_manip.py
@@ -0,0 +1,108 @@
+from math import isnan
+from module.math import get_mean, get_quartiles, get_std
+
+
+def _is_missing(n):
+ """Check if a value is None or NaN."""
+ return n is None or (isinstance(n, float) and isnan(n))
+
+
def read_csv(filename):
    """
    Reads a CSV file and returns a dictionary where each key is a column header, and the
    corresponding value is a list of string values from that column.

    ```
    {
        "header_1": ['val1', 'val2', 'val3', ...],
        "header_2": ['val1', 'val2', 'val3', ...],
        ...
    }
    ```

    Prints an error and returns None when the file does not exist.
    """
    try:
        with open(filename) as file:
            # First line is the CSV column headers
            column_headers = file.readline().strip().split(",")
            result = {header: [] for header in column_headers}

            for line in file:
                fields = line.strip().split(",")
                # Add each value from the entry into the corresponding list of the result
                # dict. zip() also protects against rows with a surplus of fields, which
                # previously raised an uncaught IndexError.
                for header, field in zip(column_headers, fields):
                    result[header].append(field)

            return result

    except FileNotFoundError:
        # Bug fix: the f-string had no placeholder, so the message never named the file.
        print(f"{filename} not found")
+
+
def parse_csv(filename, numerical_features, extra_features=None):
    """
    Reads and parses a CSV file and returns a dictionary where each key is a column header,
    and the corresponding value is a list of values from that CSV column.

    Columns are filtered to only include those whose header is in the numerical_features or
    extra_features lists. Numerical features are converted to floats. They are set to None
    if conversion is not possible.

    Parameters:
        filename (str): The name of the CSV file to parse.

        numerical_features (list): A list of strings corresponding to the titles of the\
        numerical features of the CSV file. These will be converted to float or None.

        extra_features (list): A list of strings corresponding to the titles of extra\
        features to keep in the returned data. Defaults to no extra features.

    Returns:
        dict: A dictionary where each key is a CSV column title and the corresponding value
        is a list of values from that column:
        ```
        {
            "header_1": ['val1', 'val2', 'val3', ...],
            "header_2": ['val1', 'val2', 'val3', ...],
            ...
        }
        ```
    """
    # Bug fix: a mutable default argument ([]) is shared across every call of the
    # function; use the None sentinel idiom instead.
    if extra_features is None:
        extra_features = []

    data = read_csv(filename)
    # Filter unselected features (set lookup instead of list concatenation + scan)
    selected = set(numerical_features) | set(extra_features)
    data = {k: v for k, v in data.items() if k in selected}
    # Convert numerical values from CSV (str) into numbers (float)
    for key in numerical_features:
        converted_values = []
        for raw_value in data[key]:
            try:
                converted_values.append(float(raw_value))
            except ValueError:
                converted_values.append(None)  # Set missing values to None
        data[key] = converted_values

    return data
+
+
def impute_mean(l):
    """Return a list of float for a list of float|None, replacing None/NaN with the mean."""
    # Compute the mean over the present values only
    present = [value for value in l if not _is_missing(value)]
    mean = get_mean(present)
    return [mean if _is_missing(value) else value for value in l]
+
+
def impute_median(l):
    """Return a list of float for a list of float|None, replacing None/NaN with the median."""
    # Compute the median (second quartile) over the present values only
    present = [value for value in l if not _is_missing(value)]
    median = get_quartiles(present)[1]
    return [median if _is_missing(value) else value for value in l]
+
+
def standardize(l):
    """Return a list of float|None with the values standardized (missing entries stay None)."""
    # Compute the mean and standard deviation over the present values only
    present = [value for value in l if not _is_missing(value)]
    mean = get_mean(present)
    std = get_std(present)
    return [None if _is_missing(value) else (value - mean) / std for value in l]
diff --git a/module/math.py b/module/math.py
new file mode 100644
index 0000000..7259d6b
--- /dev/null
+++ b/module/math.py
@@ -0,0 +1,234 @@
+from math import exp, log, sqrt, floor, ceil
+
+
def get_mean(l):
    """Returns the arithmetic mean of a list of numbers."""
    total = 0
    for value in l:
        total += value
    return total / len(l)
+
+
def get_min(l):
    """Returns the lowest element of a list of numbers."""
    smallest = l[0]
    for value in l[1:]:
        if value < smallest:
            smallest = value
    return smallest
+
+
def get_max(l):
    """Returns the biggest element of a list of numbers."""
    largest = l[0]
    for value in l[1:]:
        if value > largest:
            largest = value
    return largest
+
+
def get_variance(l):
    """Returns the (population) variance of a list of numbers."""
    mean = get_mean(l)
    squared_deviations = [(value - mean) ** 2 for value in l]
    return get_mean(squared_deviations)
+
+
def get_std(l):
    """Returns the (population) standard deviation of a list of numbers."""
    variance = get_variance(l)
    return sqrt(variance)
+
+
def get_quartiles(l):
    """Returns a tuple of the three quartiles (Q1, Q2, Q3) of a list of numbers."""
    sorted_values = sorted(l)
    count = len(l)
    quartiles = []
    for fraction in (0.25, 0.5, 0.75):
        position = count * fraction
        if position == int(position):
            # Whole index: take the element directly
            quartiles.append(sorted_values[int(position)])
        else:
            # Fractional index: average the two neighbouring elements
            lower = sorted_values[floor(position)]
            upper = sorted_values[ceil(position)]
            quartiles.append((lower + upper) / 2)
    return tuple(quartiles)
+
+
def get_sigmoid(x):
    """
    Returns the sigmoid (logistic function) of a number.

    Uses the numerically stable two-branch formulation: the previous single
    expression 1 / (1 + exp(-x)) raised OverflowError for x below roughly -709,
    because exp(-x) exceeds the float range. For all non-overflowing inputs both
    forms compute the exact same value.
    """
    if x >= 0:
        return 1 / (1 + exp(-x))
    # For x < 0, exp(x) <= 1 and cannot overflow
    e = exp(x)
    return e / (1 + e)
+
+
def get_linear_combination(variables, coefficients):
    """
    Returns the linear combination of a set of variables and a set of coefficients.

    Parameters:
        variables (list): A list of variables
        coefficients (list): A list of coefficients
    Returns:
        float: The sum of each variable weighted by each coefficient
    """
    # The 0.0 start value keeps the float accumulation identical to a manual loop
    return sum((v * c for v, c in zip(variables, coefficients)), 0.0)
+
+
def get_hypothesis(feature_vector, weight_vector):
    """
    Returns the probability that the output is true given a feature vector and its\
    associated weight vector.

    Parameters:
        feature_vector (list): A list of numbers representing the features

        weight_vector (list): A list of numbers of size `len(feature_vector) + 1` whose\
        last element is the bias term

    Returns:
        float: The probability that the output is true (between 0 and 1)
    """
    # Split the weights from the trailing bias term, then squash through the sigmoid
    *weights, bias = weight_vector
    z = get_linear_combination(feature_vector, weights) + bias
    return get_sigmoid(z)
+
+
def get_cost(feature_vector, weight_vector, actual_label):
    """
    Returns the logistic regression cost (log-loss) for one training example.

    Parameters:
        feature_vector (list): A list of numbers representing the features of one training\
        example.

        weight_vector (list): A list of numbers of size `len(feature_vector) + 1` whose\
        last element is the bias term. Represents the weight of each feature for the\
        hypothesis.

        actual_label (int): The actual label of the training example (1 if true or 0 if\
        false).

    Returns:
        float: The log-loss of the prediction for a training example.
    """
    h = get_hypothesis(feature_vector, weight_vector)
    # Only one of the two terms is non-zero since actual_label is 0 or 1
    positive_term = actual_label * log(h)
    negative_term = (1 - actual_label) * log(1 - h)
    return -positive_term - negative_term
+
+
def get_total_cost(feature_matrix, weight_vector, actual_labels):
    """
    Returns the average logistic regression cost over a set of training examples.

    Parameters:
        feature_matrix (list): A list of list of numbers representing the features for\
        each training example.

        weight_vector (list): A list of numbers the same size as each list in\
        `feature_matrix` whose last element is the bias term. Represents the weight of\
        each feature for the hypothesis.

        actual_labels (list): A list of numbers representing the actual labels (1 if true\
        or 0 if false), one for each training example.

    Returns:
        float: The log-loss of the predictions across all training examples.
    """
    total_cost = sum(
        (
            get_cost(feature_vector, weight_vector, actual_label)
            for feature_vector, actual_label in zip(feature_matrix, actual_labels)
        ),
        0.0,
    )
    training_example_count = len(feature_matrix)
    return (1 / training_example_count) * total_cost
+
+
def get_partial_derivative(hypotheses, actual_labels, column_vector):
    """
    Returns the partial derivative of the cost function for a given parameter.

    Parameters:
        hypotheses (list): A list of numbers representing the vector of predictions for\
        all training examples.

        actual_labels (list): A list of numbers representing the actual labels (1 if true\
        or 0 if false), one for each training example.

        column_vector (list): A list of numbers representing the value of the parameter\
        (feature) for which the partial derivative is to be computed, one for each\
        training example.

    Returns:
        float: The partial derivative of the cost function with respect to the weight\
        associated with the given parameter.
    """
    # Sum of the prediction errors weighted by the feature value of each example
    weighted_errors = sum(
        (
            (prediction - label) * feature
            for prediction, label, feature in zip(hypotheses, actual_labels, column_vector)
        ),
        0.0,
    )
    training_example_count = len(hypotheses)
    return (1 / training_example_count) * weighted_errors
+
+
def gradient_descent(
    feature_matrix, weight_vector, actual_label_vector, learning_rate, iteration_count
):
    """
    Apply the gradient descent algorithm to compute the weight vector that minimizes the\
    cost function.

    Additionally this function prints the cost update during the descent's course\
    (roughly 10 progress lines over the whole run).

    Parameters:
        feature_matrix (list): A list of list of numbers representing the features for\
        each training example.

        weight_vector (list): A list of numbers the same size as each list in\
        `feature_matrix` whose last element is the bias term. Represents the weight of\
        each feature for the hypothesis.

        actual_label_vector (list): A list of numbers representing the actual label\
        vector, one for each training example.

        learning_rate (int): The learning rate of the algorithm.

        iteration_count (int): The count of gradient descent updates the algorithm will\
        perform.

    Returns:
        (list): The updated weight vector.
    """
    # Copy the weight vector in order not to modify the original
    updated_weight_vector = weight_vector[:]

    training_example_count = len(feature_matrix)
    feature_count = len(feature_matrix[0])

    # Build the column vector of each feature (i.e. transpose the feature matrix)
    column_vectors = [[] for _ in range(feature_count)]
    for feature_vector in feature_matrix:
        for i, feature in enumerate(feature_vector):
            column_vectors[i].append(feature)
    # Add a column of features equal to 1 for the bias term partial derivative
    # computation. Bug fix: this column must contain one entry per TRAINING EXAMPLE;
    # it previously had `feature_count` entries, so zip() inside
    # get_partial_derivative silently truncated the bias gradient to the first
    # `feature_count` examples while still dividing by the full example count.
    column_vectors.append([1] * training_example_count)

    # Integer print interval (the old float modulo broke for iteration_count < 10)
    print_interval = max(1, iteration_count // 10)

    # Save initial cost for printing and comparing during the algorithm's course
    old_cost = get_total_cost(feature_matrix, weight_vector, actual_label_vector)

    for i in range(iteration_count):
        # Get the predictions for every training example
        hypotheses = [
            get_hypothesis(feature_vector, updated_weight_vector)
            for feature_vector in feature_matrix
        ]
        # Get the partial derivative of each feature (the bias term is last)
        gradients = [
            get_partial_derivative(hypotheses, actual_label_vector, column_vector)
            for column_vector in column_vectors
        ]
        # Simultaneously update all the weights
        updated_weight_vector = [
            w - learning_rate * g for w, g in zip(updated_weight_vector, gradients)
        ]
        # Print cost update periodically
        if (i + 1) % print_interval == 0:
            new_cost = get_total_cost(
                feature_matrix, updated_weight_vector, actual_label_vector
            )
            print(
                f"Iteration: {i + 1:>10}, cost: {old_cost:>8.4f} -> {new_cost:>8.4f}, diff: {abs(new_cost - old_cost):>8.6f}"
            )
            old_cost = new_cost

    return updated_weight_vector