1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
|
import sys
from module.constants import NUMERICAL_FEATURE_CSV_TITLES
from module.dataset_manip import parse_csv
from module.math import get_mean, get_std, get_min, get_max, get_quartiles
def _fit_header(header, width):
    """
    Returns the header unchanged if it fits in width characters, otherwise a shortened
    version of exactly width characters whose last character is ".".
    """
    if len(header) <= width:
        return header
    if width == 0:
        # Nothing fits in a zero-width column, not even the "." marker
        return ""
    return header[:width - 1] + "."


def print_table(number_rows, h_headers, v_headers, decimal_precision=3):
    """
    Prints a table of numbers in a human-readable way on the standard output.

    Every column, including the one holding the vertical headers, is as wide as the
    widest formatted number. Headers that do not fit are cut and end with a ".".
    Horizontal headers and numbers are right-aligned, vertical headers are left-aligned.

    Parameters:
        number_rows (list): A list of list of the numbers to print, one list for each row
        h_headers (list): A list of strings of the horizontal table headers
        v_headers (list): A list of strings of the vertical table headers
        decimal_precision (int): The amount of decimal points to display for each number

    Raises:
        IndexError: If number_rows holds fewer rows than there are vertical headers
    """
    # Format every number once and measure the resulting strings. Measuring the real
    # output (instead of estimating it from the integer part) accounts for the sign,
    # for rounding that adds a digit (9.9996 -> "10.000"), for a precision of 0 (no
    # decimal point is printed) and for values such as NaN that have no integer part.
    formatted_rows = [
        [f"{n:.{decimal_precision}f}" for n in row]
        for row in number_rows
    ]
    column_width = max(
        (len(cell) for row in formatted_rows for cell in row),
        default=0,
    )

    # Shorten the headers so that they never overflow their column
    fitted_h_headers = [_fit_header(s, column_width) for s in h_headers]
    fitted_v_headers = [_fit_header(s, column_width) for s in v_headers]

    # Header line: an empty corner cell followed by the horizontal headers
    header_cells = [" " * column_width]
    header_cells.extend(s.rjust(column_width) for s in fitted_h_headers)
    print("|" + "|".join(header_cells) + "|")

    # One line per vertical header: the header itself followed by the row's numbers.
    # Rows are looked up by index so that a missing row is reported, not silently skipped.
    for i, v_header in enumerate(fitted_v_headers):
        row_cells = [v_header.ljust(column_width)]
        row_cells.extend(cell.rjust(column_width) for cell in formatted_rows[i])
        print("|" + "|".join(row_cells) + "|")
# Script body: prints a table of summary statistics (count, mean, standard deviation,
# min, quartiles, max) for the numerical features of the CSV file given as argument.
if len(sys.argv) < 2:
    print(f"Usage: python {__file__} <dataset.csv>")
    # sys.exit instead of the bare exit(): the latter is injected by the site module
    # for interactive use and is not guaranteed to exist in every interpreter run
    sys.exit(-1)

# Get data from CSV file
dataset_filename = sys.argv[1]
data = parse_csv(dataset_filename, NUMERICAL_FEATURE_CSV_TITLES)

# Remove None values from each feature vector (keys are untouched, only values replaced)
for feature_name in data:
    data[feature_name] = [v for v in data[feature_name] if v is not None]

# Get horizontal headers
features_names = list(data.keys())

# Get vertical headers, in the same order as the rows built below
information_names = ["Count", "Mean", "Std", "Min", "25%", "50%", "75%", "Max"]

# Get rows of data: each row holds one statistic computed for every feature
feature_value_lists = data.values()

# The first three items returned by get_quartiles are used as the 25%, 50% and 75% rows
quartiles_per_feature = [get_quartiles(values) for values in feature_value_lists]
q1_list = [quartiles[0] for quartiles in quartiles_per_feature]
q2_list = [quartiles[1] for quartiles in quartiles_per_feature]
q3_list = [quartiles[2] for quartiles in quartiles_per_feature]

rows = [
    [len(values) for values in feature_value_lists],
    [get_mean(values) for values in feature_value_lists],
    [get_std(values) for values in feature_value_lists],
    [get_min(values) for values in feature_value_lists],
    q1_list,
    q2_list,
    q3_list,
    [get_max(values) for values in feature_value_lists],
]

print_table(rows, features_names, information_names)
|