1 files changed, 108 insertions, 0 deletions
diff --git a/module/dataset_manip.py b/module/dataset_manip.py
new file mode 100644
index 0000000..baea37b
--- /dev/null
+++ b/module/dataset_manip.py
@@ -0,0 +1,108 @@
+from math import isnan
+from module.math import get_mean, get_quartiles, get_std
+
+
+def _is_missing(n):
+    """Tell whether a value counts as missing, i.e. it is None or a float NaN."""
+    return True if n is None else isinstance(n, float) and isnan(n)
+
+
+def read_csv(filename):
+    """
+    Reads a CSV file and returns a dictionary where each key is a column header, and the
+    corresponding value is a list of the (string) values from that column, e.g.
+    `{"header_1": ['val1', 'val2', ...], "header_2": ['val1', 'val2', ...], ...}`.
+
+    Fields are parsed with the standard csv module, so a quoted field may contain commas.
+    Blank lines are skipped. Rows shorter than the header are padded with empty strings and
+    extra trailing fields are ignored, so every column keeps exactly one entry per data row.
+
+    If the file does not exist, a message is printed and None is returned.
+    """
+    import csv  # Imported locally so this function does not rely on module-level imports
+    try:
+        with open(filename) as file:
+            reader = csv.reader(line.strip() for line in file)  # Drops line-end whitespace
+            # First line is the CSV column headers
+            column_headers = next(reader, [])
+            result = {header: [] for header in column_headers}
+            for fields in reader:
+                if not fields:  # Blank line
+                    continue
+                fields += [""] * (len(column_headers) - len(fields))  # Pad short rows
+                for header, value in zip(column_headers, fields):  # zip() drops extra fields
+                    result[header].append(value)
+            return result
+    except FileNotFoundError:
+        print(f"{filename} not found")
+        return None
+
+
+def parse_csv(filename, numerical_features, extra_features=None):
+    """
+    Reads and parses a CSV file and returns a dictionary where each key is a column header, and the
+    corresponding value is a list of values from that CSV column.
+
+    Columns are filtered to only include those whose header is in the numerical_features or
+    extra_features lists. Numerical features are converted to floats. They are set to None if
+    conversion is not possible.
+
+    Parameters:
+        filename (str): The name of the CSV file to parse.
+
+        numerical_features (list): A list of strings corresponding to the titles of the numerical
+        features of the CSV file. These will be converted to float or None.
+
+        extra_features (list): A list of strings corresponding to the titles of extra features to
+        keep in the returned data without conversion. Defaults to None (no extra features).
+
+    Returns:
+        dict: A dictionary where each key is a CSV column title and the corresponding value is a
+        list of values from that column, e.g. `{"num": [1.0, None, ...], "extra": ['a', ...]}`.
+        None is returned if read_csv() returned None.
+
+    Raises:
+        KeyError: If a numerical feature is not a column of the CSV file.
+    """
+    data = read_csv(filename)
+    if data is None:
+        return None
+    # Filter unselected features (the set is built once rather than once per column)
+    selected = set(numerical_features) | set(extra_features or ())
+    data = {k: v for k, v in data.items() if k in selected}
+    # Convert numerical values from CSV (str) into numbers (float)
+    for key in numerical_features:
+        converted_values = []
+        for value in data[key]:
+            try:
+                converted_values.append(float(value))
+            except ValueError:
+                converted_values.append(None)  # Set missing values to None
+        data[key] = converted_values
+
+    return data
+
+
+def impute_mean(l):
+    """Returns a new list built from a float|None list, with None/NaN entries set to the mean."""
+    # The mean is computed from the non-missing entries only
+    present = [value for value in l if not _is_missing(value)]
+    fill = get_mean(present)
+    return [fill if _is_missing(value) else value for value in l]
+
+
+def impute_median(l):
+    """Returns a new list built from a float|None list, with None/NaN entries set to the median."""
+    # The median is read at index 1 of the quartiles of the non-missing entries
+    present = [value for value in l if not _is_missing(value)]
+    fill = get_quartiles(present)[1]
+    return [fill if _is_missing(value) else value for value in l]
+
+
+def standardize(l):
+    """Returns a list of float|None: each value becomes (n - mean) / std, missing ones None."""
+    # Get a list without the missing values in order to calculate the mean and the std
+    l_cleaned = [n for n in l if not _is_missing(n)]
+    mean, std = get_mean(l_cleaned), get_std(l_cleaned)
+    # If get_std() gives 0 (e.g. all values equal) dividing would raise: use 0.0 instead
+    return [None if _is_missing(n) else (0.0 if std == 0 else (n - mean) / std) for n in l]