aboutsummaryrefslogtreecommitdiffstats
path: root/histogram.py
blob: cbc78ec578d8e05d65199daef47e2e493dc0bc04 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from module.constants import (
    HOUSE_COLORS,
    NUMERICAL_FEATURE_CSV_TITLES,
    HOUSE_FEATURE_CSV_TITLE,
)
from module.dataset_manip import parse_csv
import matplotlib.pyplot as plt
import os
import pandas as pd
import sys

# Script body: read a dataset CSV given on the command line and save one
# per-house overlaid histogram PNG for every numerical feature.
if len(sys.argv) < 2:
    print(f"Usage: python {__file__} <dataset.csv>")
    # Use sys.exit rather than the bare `exit` helper: `exit` is injected by
    # the `site` module for interactive use and is missing under `python -S`.
    sys.exit(-1)

# Get data from CSV file
filename = sys.argv[1]
data = parse_csv(filename, NUMERICAL_FEATURE_CSV_TITLES, [HOUSE_FEATURE_CSV_TITLE])

df = pd.DataFrame(data)

# Loop-invariant setup: the output directory and the set of houses do not
# depend on the feature being plotted, so compute them once up front.
output_dir = "output/histogram"
os.makedirs(output_dir, exist_ok=True)
houses = df[HOUSE_FEATURE_CSV_TITLE].dropna().unique()

# Show a histogram for each numerical feature
for feature in NUMERICAL_FEATURE_CSV_TITLES:
    title = f"{feature} Histogram"
    fig, ax = plt.subplots()
    for house in houses:
        house_df = df.loc[df[HOUSE_FEATURE_CSV_TITLE] == house]
        subset = house_df.loc[:, feature].dropna()
        if subset.empty:
            # A density-normalised histogram of zero samples is undefined
            # (0/0), so skip houses with no values for this feature.
            continue
        ax.hist(
            subset,
            bins=20,
            alpha=0.7,
            density=True,
            color=HOUSE_COLORS[house.lower()],
            label=house,
        )
    ax.set_title(title)
    ax.set_xlabel(f"{feature} Score")
    # density=True normalises each histogram so its area is 1; the bar
    # heights are therefore densities (and may exceed 1), not probabilities.
    ax.set_ylabel("Probability density")
    ax.legend()

    # Save to png file
    save_filename = f"{output_dir}/{title}.png"
    fig.savefig(save_filename)
    # Close the figure explicitly so figures do not accumulate across features.
    plt.close(fig)
    print(f"Saved {title} to {save_filename}")