from math import isnan

from module.math import get_mean, get_quartiles, get_std


def _is_missing(n):
    """Check if a value is None or NaN."""
    return n is None or (isinstance(n, float) and isnan(n))


def _drop_missing(values):
    """Return a new list holding only the values that are neither None nor NaN."""
    return [n for n in values if not _is_missing(n)]


def _to_float_or_none(raw):
    """Convert a raw CSV string to float, or None when it is not a valid number."""
    try:
        return float(raw)
    except ValueError:
        return None


def read_csv(filename):
    """
    Read a CSV file into a dictionary of columns.

    The first line of the file is taken as the column headers. Lines are split
    on plain commas (quoted fields are not interpreted), after stripping the
    surrounding whitespace of each line.

    Every data row contributes exactly one value to every column so that all
    columns keep the same length: fields beyond the number of headers are
    ignored, and fields missing at the end of a short row are stored as "".

    Parameters:
        filename (str): The path of the CSV file to read.

    Returns:
        dict | None: A dictionary where each key is a column header and the
        corresponding value is the list of (string) values of that column:
        ```
        {
            "header_1": ['val1', 'val2', 'val3', ...],
            "header_2": ['val1', 'val2', 'val3', ...],
            ...
        }
        ```
        None is returned (after printing a message) when the file does not exist.
    """
    try:
        with open(filename) as file:
            # First line is the CSV column headers
            column_headers = file.readline().strip().split(",")
            result = {header: [] for header in column_headers}

            for line in file:
                fields = line.strip().split(",")
                # Walk the headers (not the fields) so that a malformed row can
                # neither index past the headers nor leave a column shorter
                # than the others.
                for index, header in enumerate(column_headers):
                    value = fields[index] if index < len(fields) else ""
                    result[header].append(value)

            return result
    except FileNotFoundError:
        # Best-effort behavior: report the problem and signal it with None.
        print(f"{filename} not found")
        return None


def parse_csv(filename, numerical_features, extra_features=None):
    """
    Read and parse a CSV file into a dictionary of columns.

    Columns are filtered to only include those whose header is in the
    numerical_features or extra_features lists. Numerical features are
    converted to floats; a value is set to None if conversion is not possible.

    Parameters:
        filename (str): The name of the CSV file to parse.
        numerical_features (list): A list of strings corresponding to the
            titles of the numerical features of the CSV file. These will be
            converted to float or None.
        extra_features (list | None): A list of strings corresponding to the
            titles of extra features to keep, unconverted, in the returned
            data. Defaults to no extra features.

    Returns:
        dict | None: A dictionary where each key is a CSV column title and the
        corresponding value is a list of values from that column:
        ```
        {
            "numerical_header": [1.0, None, 3.5, ...],
            "extra_header": ['val1', 'val2', 'val3', ...],
            ...
        }
        ```
        None is returned when the file does not exist (read_csv has already
        printed a message in that case).

    Raises:
        KeyError: If a name in numerical_features is not a column of the file.
    """
    data = read_csv(filename)
    if data is None:
        # The file was not found; propagate read_csv's "no data" signal
        # instead of failing on None below.
        return None

    # Build the set of selected headers once instead of once per column.
    selected = set(numerical_features)
    if extra_features is not None:
        selected.update(extra_features)

    # Filter unselected features
    data = {k: v for k, v in data.items() if k in selected}

    # Convert numerical values from CSV (str) into numbers (float),
    # setting unparseable values to None
    for key in numerical_features:
        data[key] = [_to_float_or_none(value) for value in data[key]]

    return data


def impute_mean(l):
    """
    Returns a list of float for a list of float|None, replacing None/NaN values with the mean.

    The mean is computed by get_mean over the non-missing values only.
    NOTE(review): behavior when every value is missing depends on how get_mean
    handles an empty list - confirm against its implementation.
    """
    mean = get_mean(_drop_missing(l))
    return [mean if _is_missing(n) else n for n in l]


def impute_median(l):
    """
    Returns a list of float for a list of float|None, replacing None/NaN values with the median.

    The median is taken as the element at index 1 of get_quartiles' result,
    computed over the non-missing values only.
    NOTE(review): behavior when every value is missing depends on how
    get_quartiles handles an empty list - confirm against its implementation.
    """
    median = get_quartiles(_drop_missing(l))[1]
    return [median if _is_missing(n) else n for n in l]


def standardize(l):
    """
    Returns a list of float|None, standardizing the values.

    Each non-missing value n becomes (n - mean) / std, where mean and std are
    computed by get_mean and get_std over the non-missing values. Missing
    values (None/NaN) are returned as None.

    When the standard deviation is 0 (all present values are identical), the
    non-missing values are mapped to 0.0 instead of dividing by zero. When
    every value is missing, a list of None of the same length is returned.
    """
    # Get a list without the missing values in order to calculate the mean and std
    l_cleaned = _drop_missing(l)
    if not l_cleaned:
        # Nothing to compute statistics from: every output is None anyway.
        return [None for _ in l]

    mean = get_mean(l_cleaned)
    std = get_std(l_cleaned)
    if std == 0:
        # Constant column: every value equals the mean, so its z-score is 0.
        return [None if _is_missing(n) else 0.0 for n in l]

    return [None if _is_missing(n) else (n - mean) / std for n in l]