# describe.py

import sys
from module.constants import NUMERICAL_FEATURE_CSV_TITLES
from module.dataset_manip import parse_csv
from module.math import get_mean, get_std, get_min, get_max, get_quartiles


def _truncate_header(header, width):
    """Return header cut to at most width characters, ending in "." if it was shortened."""
    if len(header) <= width:
        return header
    # The last kept character is replaced by "." so the marker stays inside the column
    return header[:width][:-1] + "."


def print_table(number_rows, h_headers, v_headers, decimal_precision=3):
    """
    Prints a table of numbers in a human-readable way on the standard output.

    Every column (including the one holding the vertical headers) has the same width: the
    width of the widest number once formatted. Headers that do not fit are truncated and
    end with "." to show it. One row is printed per vertical header.

    Parameters:
        number_rows (list): A list of list of the numbers to print, one list for each row
        h_headers (list): A list of strings of the horizontal table headers
        v_headers (list): A list of strings of the vertical table headers
        decimal_precision (int): The amount of decimal points to display for each number

    Raises:
        IndexError: If number_rows holds fewer rows than there are vertical headers
    """

    # Format every number once and measure the resulting text. Measuring the printed text
    # (instead of estimating it from int(n)) accounts for the sign, for values that gain a
    # digit when rounded (9.9996 -> "10.000") and for non-finite values.
    formatted_rows = [
        [f"{number:.{decimal_precision}f}" for number in row] for row in number_rows
    ]
    max_column_width = max(
        (len(cell) for row in formatted_rows for cell in row), default=0
    )

    # Truncate headers to fit the maximum column width
    truncated_h_headers = [_truncate_header(s, max_column_width) for s in h_headers]
    truncated_v_headers = [_truncate_header(s, max_column_width) for s in v_headers]

    # Header line: an empty corner cell followed by the right-aligned horizontal headers
    header_cells = ["".ljust(max_column_width)]
    header_cells.extend(s.rjust(max_column_width) for s in truncated_h_headers)
    print("|" + "|".join(header_cells) + "|")

    # One line per vertical header: left-aligned header, then right-aligned numbers
    for i, v_header in enumerate(truncated_v_headers):
        row_cells = [v_header.ljust(max_column_width)]
        row_cells.extend(cell.rjust(max_column_width) for cell in formatted_rows[i])
        print("|" + "|".join(row_cells) + "|")


# Script body: reads the CSV file named by the first command-line argument and prints, for
# every numerical feature, its count, mean, standard deviation, minimum, quartiles and maximum.
if len(sys.argv) < 2:
    print(f"Usage: python {__file__} <dataset.csv>")
    # sys.exit rather than the exit() helper: exit() is injected by the site module for
    # interactive use and is not defined when the interpreter runs without it (python -S)
    sys.exit(-1)

# Get data from CSV file
dataset_filename = sys.argv[1]
data = parse_csv(dataset_filename, NUMERICAL_FEATURE_CSV_TITLES)
# Remove None values from each feature vector (values are replaced in place, keys untouched)
for feature_name in data:
    data[feature_name] = [v for v in data[feature_name] if v is not None]

# Get horizontal headers
features_names = list(data.keys())
# Get vertical headers; the rows below are appended in this same order
information_names = ["Count", "Mean", "Std", "Min", "25%", "50%", "75%", "Max"]
# Get rows of data: each row holds one statistic computed for every feature
feature_value_lists = list(data.values())
# get_quartiles is called once per feature; indices 0, 1, 2 feed the 25%, 50%, 75% rows
quartiles_per_feature = [get_quartiles(values) for values in feature_value_lists]
rows = [
    [len(values) for values in feature_value_lists],
    [get_mean(values) for values in feature_value_lists],
    [get_std(values) for values in feature_value_lists],
    [get_min(values) for values in feature_value_lists],
]
rows.extend(
    [quartiles[i] for quartiles in quartiles_per_feature] for i in range(3)
)
rows.append([get_max(values) for values in feature_value_lists])

print_table(rows, features_names, information_names)