aboutsummaryrefslogtreecommitdiffstats
path: root/module/dataset_manip.py
diff options
context:
space:
mode:
Diffstat (limited to 'module/dataset_manip.py')
-rw-r--r--module/dataset_manip.py108
1 files changed, 108 insertions, 0 deletions
diff --git a/module/dataset_manip.py b/module/dataset_manip.py
new file mode 100644
index 0000000..baea37b
--- /dev/null
+++ b/module/dataset_manip.py
@@ -0,0 +1,108 @@
+from math import isnan
+from module.math import get_mean, get_quartiles, get_std
+
+
def _is_missing(n):
    """Tell whether n counts as a missing value, i.e. is None or a float NaN."""
    if n is None:
        return True
    # isnan() is only consulted for floats; any other type is never treated as missing
    return isinstance(n, float) and isnan(n)
+
+
def read_csv(filename):
    """
    Reads a CSV file and returns a dictionary where each key is a column header, and the
    corresponding value is a list of the (string) values from that column.

    ```
    {
        "header_1": ['val1', 'val2', 'val3', ...],
        "header_2": ['val1', 'val2', 'val3', ...],
        ...
    }
    ```

    The first line of the file supplies the headers. Lines are split naively on ",", so quoted
    fields that contain commas are not supported.

    Blank lines are skipped, and rows holding fewer fields than there are headers are padded
    with empty strings, so that the columns stay aligned row by row.

    Parameters:
        filename (str): The path of the CSV file to read.

    Returns:
        dict | None: The dictionary of columns, or None if the file does not exist (in which
        case a message is printed).

    Raises:
        IndexError: If a row holds more fields than there are headers.
    """
    try:
        with open(filename) as file:
            # First line is the CSV column headers
            column_headers = file.readline().strip().split(",")
            result = {header: [] for header in column_headers}
            n_columns = len(column_headers)

            # Data starts on line 2 of the file (line 1 is the header)
            for line_number, line in enumerate(file, start=2):
                stripped = line.strip()
                if not stripped:
                    # A blank line carries no row; storing it would shift the first column
                    continue

                fields = stripped.split(",")
                if len(fields) > n_columns:
                    raise IndexError(
                        f"{filename}: line {line_number} has {len(fields)} fields, "
                        f"expected at most {n_columns}"
                    )
                # Pad short rows so every column receives exactly one value per row
                fields.extend([""] * (n_columns - len(fields)))

                # Add each value from the entry into the corresponding list of the result dict
                for header, value in zip(column_headers, fields):
                    result[header].append(value)

            return result

    except FileNotFoundError:
        print(f"{filename} not found")
        return None
+
+
def _to_float_or_none(value):
    """Converts a CSV string to a float, or returns None if it is not a valid number."""
    try:
        return float(value)
    except ValueError:
        return None  # Set missing / non-numeric values to None


def parse_csv(filename, numerical_features, extra_features=None):
    """
    Reads and parses a CSV file and returns a dictionary where each key is a column header, and the
    corresponding value is a list of values from that CSV column.

    Columns are filtered to only include those whose header is in the numerical_features or
    extra_features lists. Numerical features are converted to floats. They are set to None if
    conversion is not possible.

    Parameters:
        filename (str): The name of the CSV file to parse.

        numerical_features (list): A list of strings corresponding to the titles of the numerical
        features of the CSV file. These will be converted to float or None.

        extra_features (list | None): A list of strings corresponding to the titles of extra
        features to keep, unconverted, in the returned data. Defaults to no extra features.

    Returns:
        dict | None: A dictionary where each key is a CSV column title and the corresponding value
        is a list of values from that column:
        ```
        {
            "header_1": [1.0, None, 3.5, ...],
            "header_2": ['val1', 'val2', 'val3', ...],
            ...
        }
        ```
        None is returned when read_csv() returns None for the file.

    Raises:
        KeyError: If a title of numerical_features is not a column of the CSV file.
    """
    data = read_csv(filename)
    if data is None:
        # read_csv() gave no data (it returns None for a missing file); nothing to parse
        return None

    # Build the set of selected titles once instead of concatenating lists for every column
    selected = set(numerical_features)
    if extra_features is not None:
        selected.update(extra_features)

    # Filter unselected features
    data = {k: v for k, v in data.items() if k in selected}

    # Convert numerical values from CSV (str) into numbers (float)
    for key in numerical_features:
        data[key] = [_to_float_or_none(value) for value in data[key]]

    return data
+
+
def impute_mean(l):
    """
    Returns a copy of l (a list of float|None) in which every missing entry (None or NaN) is
    replaced by the mean of the entries that are present.
    """
    # The mean is computed over the present values only
    present = [value for value in l if not _is_missing(value)]
    fill_value = get_mean(present)

    imputed = []
    for value in l:
        imputed.append(fill_value if _is_missing(value) else value)
    return imputed
+
+
def impute_median(l):
    """
    Returns a copy of l (a list of float|None) in which every missing entry (None or NaN) is
    replaced by the median of the entries that are present.
    """
    # The median is taken as the second element returned by get_quartiles(), computed over
    # the present values only
    present = [value for value in l if not _is_missing(value)]
    fill_value = get_quartiles(present)[1]

    imputed = []
    for value in l:
        imputed.append(fill_value if _is_missing(value) else value)
    return imputed
+
+
def standardize(l):
    """
    Returns a list of float|None, standardizing the values of l.

    Each present value n becomes (n - mean) / std, where mean and std are obtained from
    get_mean() and get_std() over the non-missing values. Missing entries (None or NaN) are
    returned as None.

    If std is 0 (e.g. all present values are identical), the values are only centered, so
    they come out as 0.0 instead of raising ZeroDivisionError.
    """
    # Get a list without the missing values in order to calculate the mean and the std
    l_cleaned = [n for n in l if not _is_missing(n)]
    mean = get_mean(l_cleaned)
    std = get_std(l_cleaned)
    if std == 0:
        # No spread: dividing by 1 keeps the centered values (all 0.0) and avoids a crash
        std = 1
    return [(n - mean) / std if not _is_missing(n) else None for n in l]