diff options
| author | Thomas Vanbesien <tvanbesi@proton.me> | 2026-04-01 17:42:04 +0200 |
|---|---|---|
| committer | Thomas Vanbesien <tvanbesi@proton.me> | 2026-04-01 17:42:04 +0200 |
| commit | 32cd9b2be1763f872c800b17e1fa63f852fe91c1 (patch) | |
| tree | 8aee9bd7e81d8204faca701c0a852bcf7dc45de6 /module/dataset_manip.py | |
| download | DSLR-master.tar.gz DSLR-master.zip | |
Diffstat (limited to 'module/dataset_manip.py')
| -rw-r--r-- | module/dataset_manip.py | 108 |
1 files changed, 108 insertions, 0 deletions
"""CSV loading and column-cleaning helpers (module/dataset_manip.py)."""

from math import isnan

from module.math import get_mean, get_quartiles, get_std


def _is_missing(n):
    """Check if a value is None or a float NaN."""
    return n is None or (isinstance(n, float) and isnan(n))


def _to_float_or_none(value):
    """Convert a CSV string to float, or return None if it is not a valid number."""
    try:
        return float(value)
    except ValueError:
        return None  # Missing / non-numerical values are represented by None


def read_csv(filename):
    """
    Reads a CSV file and returns a dictionary where each key is a column header, and the
    corresponding value is a list of values (as strings) from that column.

    ```
    {
        "header_1": ['val1', 'val2', 'val3', ...],
        "header_2": ['val1', 'val2', 'val3', ...],
        ...
    }
    ```

    Fields are split on every comma; quoted fields are not supported. Every column of the
    result has one entry per data row: rows with fewer fields than headers are padded with
    empty strings, and blank lines are ignored (unless the file has a single column, where a
    blank line is a row holding one empty value).

    Parameters:
        filename (str): The name of the CSV file to read.

    Returns:
        dict | None: The columns of the file, or None if the file does not exist (a message
        is printed in that case).

    Raises:
        IndexError: If a row has more fields than there are column headers.
    """
    # Only the open() call is expected to raise FileNotFoundError, so keep the try minimal
    try:
        file = open(filename, encoding="utf-8")
    except FileNotFoundError:
        print(f"{filename} not found")
        return None

    with file:
        # First line is the CSV column headers
        column_headers = file.readline().strip().split(",")
        n_columns = len(column_headers)
        result = {header: [] for header in column_headers}

        # Data rows start at line 2 of the file (line 1 is the header)
        for line_number, line in enumerate(file, start=2):
            stripped = line.strip()
            if not stripped and n_columns > 1:
                # A blank line cannot be a row of a multi-column file; storing it would
                # shift the first column relative to the others
                continue

            fields = stripped.split(",")
            if len(fields) > n_columns:
                raise IndexError(
                    f"{filename}: line {line_number} has {len(fields)} fields, "
                    f"expected {n_columns}"
                )
            # Pad short rows so that all columns keep the same length
            fields.extend([""] * (n_columns - len(fields)))

            # Add each value from the entry into the corresponding list of the result dict
            for header, value in zip(column_headers, fields):
                result[header].append(value)

    return result


def parse_csv(filename, numerical_features, extra_features=None):
    """
    Reads and parses a CSV file and returns a dictionary where each key is a column header, and the
    corresponding value is a list of values from that CSV column.

    Columns are filtered to only include those whose header is in the numerical_features or
    extra_features lists. Numerical features are converted to floats. They are set to None if
    conversion is not possible.

    Parameters:
        filename (str): The name of the CSV file to parse.

        numerical_features (list): A list of strings corresponding to the titles of the numerical
        features of the CSV file. These will be converted to float or None.

        extra_features (list): A list of strings corresponding to the titles of extra features to
        keep in the returned data. Defaults to no extra feature.

    Returns:
        dict | None: A dictionary where each key is a CSV column title and the corresponding
        value is a list of values from that column, or None if the file does not exist:
        ```
        {
            "header_1": ['val1', 'val2', 'val3', ...],
            "header_2": ['val1', 'val2', 'val3', ...],
            ...
        }
        ```

    Raises:
        KeyError: If a numerical feature is not a column of the CSV file.
    """
    data = read_csv(filename)
    if data is None:
        # read_csv already reported the missing file; propagate its "no data" result
        return None

    # Build the set of selected headers once instead of once per column
    selected = set(numerical_features)
    if extra_features is not None:
        selected.update(extra_features)

    # Filter unselected features
    data = {k: v for k, v in data.items() if k in selected}

    # Convert numerical values from CSV (str) into numbers (float)
    for key in numerical_features:
        data[key] = [_to_float_or_none(value) for value in data[key]]

    return data


def impute_mean(l):
    """
    Returns a list of float for a list of float|None, replacing None/NaN values with the mean.

    The mean is computed by get_mean over the non-missing values only.
    NOTE(review): if every value is missing, get_mean receives an empty list; its behaviour in
    that case is not visible from this file -- confirm.
    """
    # Get a list without the missing values in order to calculate the mean
    l_cleaned = [n for n in l if not _is_missing(n)]
    mean = get_mean(l_cleaned)
    return [mean if _is_missing(n) else n for n in l]


def impute_median(l):
    """
    Returns a list of float for a list of float|None, replacing None/NaN values with the median.

    The median is taken as the element at index 1 of get_quartiles over the non-missing values.
    NOTE(review): if every value is missing, get_quartiles receives an empty list; its
    behaviour in that case is not visible from this file -- confirm.
    """
    # Get a list without the missing values in order to calculate the median
    l_cleaned = [n for n in l if not _is_missing(n)]
    median = get_quartiles(l_cleaned)[1]
    return [median if _is_missing(n) else n for n in l]


def standardize(l):
    """
    Returns a list of float|None, standardizing the values.

    Each non-missing value n becomes (n - mean) / std, where mean and std are computed by
    get_mean and get_std over the non-missing values. Missing values (None/NaN) become None.
    If the standard deviation is zero, every non-missing value maps to 0.0 instead of raising
    ZeroDivisionError.
    """
    # Get a list without the missing values in order to calculate the mean and the std
    l_cleaned = [n for n in l if not _is_missing(n)]
    mean = get_mean(l_cleaned)
    std = get_std(l_cleaned)

    if std == 0:
        # No spread: (n - mean) is zero for every value, so avoid the 0 / 0 division
        return [None if _is_missing(n) else 0.0 for n in l]

    return [None if _is_missing(n) else (n - mean) / std for n in l]
