aboutsummaryrefslogtreecommitdiffstats
path: root/histogram.py
diff options
context:
space:
mode:
author Thomas Vanbesien <tvanbesi@proton.me> 2026-04-01 17:42:04 +0200
committer Thomas Vanbesien <tvanbesi@proton.me> 2026-04-01 17:42:04 +0200
commit 32cd9b2be1763f872c800b17e1fa63f852fe91c1 (patch)
tree 8aee9bd7e81d8204faca701c0a852bcf7dc45de6 /histogram.py
downloadDSLR-32cd9b2be1763f872c800b17e1fa63f852fe91c1.tar.gz
DSLR-32cd9b2be1763f872c800b17e1fa63f852fe91c1.zip
Import from github.com HEAD master
Diffstat (limited to 'histogram.py')
-rw-r--r-- histogram.py 46
1 files changed, 46 insertions, 0 deletions
diff --git a/histogram.py b/histogram.py
new file mode 100644
index 0000000..cbc78ec
--- /dev/null
+++ b/histogram.py
@@ -0,0 +1,46 @@
+from module.constants import (
+ HOUSE_COLORS,
+ NUMERICAL_FEATURE_CSV_TITLES,
+ HOUSE_FEATURE_CSV_TITLE,
+)
+from module.dataset_manip import parse_csv
+import matplotlib.pyplot as plt
+import os
+import pandas as pd
+import sys
+
+# Script body: read a dataset CSV given on the command line and save, for every
+# numerical feature, one PNG with an overlaid histogram per house.
+if len(sys.argv) < 2:
+    # Usage errors go to stderr. sys.exit() is used instead of the exit()
+    # helper, which the site module provides for interactive use and which
+    # is not guaranteed to exist in every interpreter setup.
+    print(f"Usage: python {__file__} <dataset.csv>", file=sys.stderr)
+    sys.exit(-1)
+
+# Get data from CSV file: parse_csv receives the numerical feature titles and
+# the house column title; its result is wrapped in a DataFrame.
+filename = sys.argv[1]
+data = parse_csv(filename, NUMERICAL_FEATURE_CSV_TITLES, [HOUSE_FEATURE_CSV_TITLE])
+df = pd.DataFrame(data)
+
+# Loop-invariant setup, done once instead of once per feature: the output
+# directory and the distinct house labels (rows without a house are ignored).
+os.makedirs("output/histogram", exist_ok=True)
+houses = df[HOUSE_FEATURE_CSV_TITLE].dropna().unique()
+
+# Save a histogram figure for each numerical feature
+for feature in NUMERICAL_FEATURE_CSV_TITLES:
+    title = f"{feature} Histogram"
+    fig, ax = plt.subplots()
+    for house in houses:
+        # Values of this feature for the current house, missing values dropped.
+        subset = df.loc[df[HOUSE_FEATURE_CSV_TITLE] == house, feature].dropna()
+        # HOUSE_COLORS is looked up with the lower-cased house name.
+        ax.hist(
+            subset,
+            bins=20,
+            alpha=0.7,
+            density=True,
+            color=HOUSE_COLORS[house.lower()],
+            label=house,
+        )
+    ax.set_title(title)
+    ax.set_xlabel(f"{feature} Score")
+    # NOTE(review): with density=True matplotlib plots a probability density
+    # (area sums to 1), so bar heights can exceed 1; label kept unchanged.
+    ax.set_ylabel("Probability")
+    ax.legend()
+
+    # Save to png file, then close the figure so pyplot does not keep it open
+    # across iterations.
+    save_filename = f"output/histogram/{title}.png"
+    fig.savefig(save_filename)
+    plt.close(fig)
+    print(f"Saved {title} to {save_filename}")