"""Save one histogram per numerical feature, with one series per house.

Usage: python <this script> <dataset.csv>

The CSV given on the command line is parsed with ``parse_csv`` and loaded
into a DataFrame. For every title in ``NUMERICAL_FEATURE_CSV_TITLES`` a
figure is drawn with the per-house distributions overlaid, then written to
``output/histogram/<feature> Histogram.png``.
"""

import os
import sys

import matplotlib.pyplot as plt
import pandas as pd

from module.constants import (
    HOUSE_COLORS,
    HOUSE_FEATURE_CSV_TITLE,
    NUMERICAL_FEATURE_CSV_TITLES,
)
from module.dataset_manip import parse_csv

if len(sys.argv) < 2:
    # Diagnostics go to stderr; sys.exit is used because the bare ``exit``
    # helper is only injected by the ``site`` module and may be missing.
    print(f"Usage: python {__file__} <dataset.csv>", file=sys.stderr)
    sys.exit(-1)

# Get data from CSV file
filename = sys.argv[1]
data = parse_csv(filename, NUMERICAL_FEATURE_CSV_TITLES, [HOUSE_FEATURE_CSV_TITLE])
df = pd.DataFrame(data)

# Loop-invariant setup: the output directory and the list of houses do not
# depend on the feature being plotted, so compute them once.
output_dir = "output/histogram"
os.makedirs(output_dir, exist_ok=True)
houses = df[HOUSE_FEATURE_CSV_TITLE].dropna().unique()

# Show a histogram for each numerical feature
for feature in NUMERICAL_FEATURE_CSV_TITLES:
    title = f"{feature} Histogram"
    fig, ax = plt.subplots()

    for house in houses:
        house_df = df.loc[df[HOUSE_FEATURE_CSV_TITLE] == house]
        subset = house_df.loc[:, feature].dropna()
        if subset.empty:
            # density=True normalises by the sample count; an empty sample
            # would yield NaN bars and a runtime warning, so skip it.
            continue
        ax.hist(
            subset,
            bins=20,
            alpha=0.7,
            density=True,
            color=HOUSE_COLORS[house.lower()],
            label=house,
        )

    ax.set_title(title)
    ax.set_xlabel(f"{feature} Score")
    ax.set_ylabel("Probability")
    ax.legend()

    # Save to png file
    save_filename = f"{output_dir}/{title}.png"
    fig.savefig(save_filename)
    # Close explicitly so figures do not accumulate across features.
    plt.close(fig)
    print(f"Saved {title} to {save_filename}")