aboutsummaryrefslogtreecommitdiffstats
path: root/module/dataset_manip.py
blob: baea37b5a873c31c88a987687a4381d33e0f4c4c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from math import isnan
from module.math import get_mean, get_quartiles, get_std


def _is_missing(n):
    """Check if a value is None or NaN."""
    return n is None or (isinstance(n, float) and isnan(n))


def read_csv(filename):
    """
    Reads a CSV file and returns a dictionary where each key is a column header, and the
    corresponding value is a list of values (as strings) from that column.

    ```
    {
        "header_1": ['val1', 'val2', 'val3', ...],
        "header_2": ['val1', 'val2', 'val3', ...],
        ...
    }
    ```

    Parsing is a plain split on ",": quoted fields and embedded commas are not supported.
    Every column ends up with one value per data row, so values at the same index belong to the
    same row:
      - blank lines in a multi-column file are skipped,
      - a row with fewer fields than headers is padded with empty strings,
      - fields beyond the last header are ignored.

    Parameters:
        filename (str): The name of the CSV file to read.

    Returns:
        dict | None: The columns keyed by header, or None if the file does not exist (a message
        is printed in that case).
    """
    try:
        with open(filename) as file:
            # First line is the CSV column headers
            column_headers = file.readline().strip().split(",")
            result = {header: [] for header in column_headers}

            for line in file:
                line = line.strip()
                # With several columns an empty line cannot be a valid row. With a single column
                # it is kept, because it may stand for an empty value.
                if not line and len(column_headers) > 1:
                    continue
                fields = line.split(",")
                # Walk the headers (not the fields) so that every column receives exactly one
                # value per row, whatever the number of fields on the line
                for i, header in enumerate(column_headers):
                    result[header].append(fields[i] if i < len(fields) else "")

            return result

    except FileNotFoundError:
        print(f"{filename} not found")
        return None


def parse_csv(filename, numerical_features, extra_features=None):
    """
    Reads and parses a CSV file and returns a dictionary where each key is a column header, and the
    corresponding value is a list of values from that CSV column.

    Columns are filtered to only include those whose header is in the numerical_features or
    extra_features lists. Numerical features are converted to floats. They are set to None if
    conversion is not possible.

    Parameters:
        filename (str): The name of the CSV file to parse.

        numerical_features (list): A list of strings corresponding to the titles of the numerical
        features of the CSV file. These will be converted to float or None.

        extra_features (list): A list of strings corresponding to the titles of extra features to
        keep in the returned data, left as strings. Defaults to no extra feature.

    Returns:
        dict | None: A dictionary where each key is a CSV column title and the corresponding value
        is a list of values from that column (float|None for numerical features, str for extra
        features):
        ```
        {
            "numerical_header": [1.0, None, 3.5, ...],
            "extra_header": ['val1', 'val2', 'val3', ...],
            ...
        }
        ```
        None is returned when read_csv() returned None instead of the file content.

    Raises:
        KeyError: If a numerical feature is not a column of the CSV file.
    """
    data = read_csv(filename)
    if data is None:
        # Nothing was read: hand the None back instead of failing on None.items()
        return None

    # Build the set of selected headers once instead of concatenating lists for every column
    selected = set(numerical_features)
    if extra_features is not None:
        selected.update(extra_features)

    # Filter unselected features
    data = {k: v for k, v in data.items() if k in selected}

    # Convert numerical values from CSV (str) into numbers (float)
    for key in numerical_features:
        converted_values = []
        for raw_value in data[key]:
            try:
                converted_values.append(float(raw_value))
            except ValueError:
                converted_values.append(None)  # Set missing values to None
        data[key] = converted_values

    return data


def impute_mean(l):
    """
    Returns a copy of a list of float|None where every missing value (None/NaN) is replaced
    with the mean of the values that are present.
    """
    # The mean is computed by get_mean() on the non-missing values only
    observed = []
    for value in l:
        if not _is_missing(value):
            observed.append(value)
    fill = get_mean(observed)

    # Rebuild the list in the same order, filling each gap with the mean
    imputed = []
    for value in l:
        imputed.append(fill if _is_missing(value) else value)
    return imputed


def impute_median(l):
    """
    Returns a copy of a list of float|None where every missing value (None/NaN) is replaced
    with the median of the values that are present.
    """
    # Quartiles are computed on the non-missing values only
    observed = [value for value in l if not _is_missing(value)]
    quartiles = get_quartiles(observed)
    # The element at index 1 of the get_quartiles() result is used as the median
    fill = quartiles[1]
    return [fill if _is_missing(value) else value for value in l]


def standardize(l):
    """
    Returns a list of float|None, standardizing the values: each value becomes
    (value - mean) / std, where mean and std come from get_mean() and get_std() applied to the
    non-missing values.

    Missing values (None/NaN) are returned as None. When the standard deviation is 0 (all the
    non-missing values are equal), the non-missing values are returned as 0.0 instead of dividing
    by zero.
    """
    # Get a list without the missing values in order to calculate the mean and the std
    l_cleaned = [n for n in l if not _is_missing(n)]
    if not l_cleaned:
        # Every value is missing: the result can only be None everywhere, so there is no need to
        # ask for the statistics of an empty list
        return [None for _ in l]

    mean = get_mean(l_cleaned)
    std = get_std(l_cleaned)
    if std == 0:
        # No spread: every non-missing value sits on the mean, so its standardized value is 0
        return [None if _is_missing(n) else 0.0 for n in l]

    return [None if _is_missing(n) else (n - mean) / std for n in l]