diff options
| author | Thomas Vanbesien <tvanbesi@proton.me> | 2026-04-01 17:42:04 +0200 |
|---|---|---|
| committer | Thomas Vanbesien <tvanbesi@proton.me> | 2026-04-01 17:42:04 +0200 |
| commit | 32cd9b2be1763f872c800b17e1fa63f852fe91c1 (patch) | |
| tree | 8aee9bd7e81d8204faca701c0a852bcf7dc45de6 /histogram.py | |
| download | DSLR-master.tar.gz DSLR-master.zip | |
Diffstat (limited to 'histogram.py')
| -rw-r--r-- | histogram.py | 46 |
1 files changed, 46 insertions, 0 deletions
"""Save one histogram per numerical feature, overlaying one series per house.

Usage: python histogram.py <dataset.csv>

Every figure is written to ``output/histogram/<feature> Histogram.png``.
"""

import os
import sys

import matplotlib.pyplot as plt
import pandas as pd

from module.constants import (
    HOUSE_COLORS,
    HOUSE_FEATURE_CSV_TITLE,
    NUMERICAL_FEATURE_CSV_TITLES,
)
from module.dataset_manip import parse_csv

# Directory that receives the generated PNG files.
OUTPUT_DIR = "output/histogram"


def _plot_feature_histogram(df, feature, houses):
    """Draw and save the histogram of ``feature``, one series per house.

    Args:
        df: DataFrame holding the ``feature`` column and the house column.
        feature: Title of the numerical column to plot.
        houses: Distinct, non-null values of the house column.

    Returns:
        A ``(title, save_filename)`` tuple describing the saved figure.
    """
    title = f"{feature} Histogram"
    save_filename = f"{OUTPUT_DIR}/{title}.png"

    fig, ax = plt.subplots()
    try:
        for house in houses:
            in_house = df[HOUSE_FEATURE_CSV_TITLE] == house
            scores = df.loc[in_house, feature].dropna()
            if scores.empty:
                # density=True would normalise an empty histogram (0/0),
                # producing NaN bars and a RuntimeWarning.
                continue
            ax.hist(
                scores,
                bins=20,
                alpha=0.7,
                density=True,
                color=HOUSE_COLORS[house.lower()],
                label=house,
            )
        ax.set_title(title)
        ax.set_xlabel(f"{feature} Score")
        ax.set_ylabel("Probability")
        # Only build a legend when at least one series was drawn; an empty
        # legend request makes matplotlib emit a warning.
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend()

        fig.savefig(save_filename)
    finally:
        # Release the figure even if drawing or saving fails.
        plt.close(fig)
    return title, save_filename


def main():
    """Read the CSV given on the command line and save every histogram."""
    if len(sys.argv) < 2:
        print(f"Usage: python {__file__} <dataset.csv>", file=sys.stderr)
        # sys.exit rather than the site-provided exit(), which is not
        # available when Python runs without the site module.
        sys.exit(-1)

    # Get data from CSV file
    filename = sys.argv[1]
    data = parse_csv(
        filename, NUMERICAL_FEATURE_CSV_TITLES, [HOUSE_FEATURE_CSV_TITLE]
    )
    df = pd.DataFrame(data)

    # Both of these are identical for every feature, so compute them once.
    houses = df[HOUSE_FEATURE_CSV_TITLE].dropna().unique()
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Show a histogram for each numerical feature
    for feature in NUMERICAL_FEATURE_CSV_TITLES:
        title, save_filename = _plot_feature_histogram(df, feature, houses)
        print(f"Saved {title} to {save_filename}")


if __name__ == "__main__":
    main()
