aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThomas Vanbesien <tvanbesi@proton.me>2026-03-30 17:10:31 +0200
committerThomas Vanbesien <tvanbesi@proton.me>2026-03-30 17:22:03 +0200
commitb998b2cdfe454c9d177e06304c2c01c63747335c (patch)
tree4f55811de78a23dc67ca62a7da052beb47145c85
parentfd5fe70ce5271f09303b51dae34b42acc47f5730 (diff)
downloadft_linear_regression-b998b2cdfe454c9d177e06304c2c01c63747335c.tar.gz
ft_linear_regression-b998b2cdfe454c9d177e06304c2c01c63747335c.zip
Rename train_once to compute_gradients, clean up normalize, document normalization in README
-rw-r--r--README.md26
-rw-r--r--train.py12
2 files changed, 32 insertions, 6 deletions
diff --git a/README.md b/README.md
index 4d6833d..47b3583 100644
--- a/README.md
+++ b/README.md
@@ -6,11 +6,17 @@ A simple linear regression implementation using gradient descent to predict car
- Python 3
- matplotlib (for visualization only)
+- pandoc (to generate HTML documentation)
```
pip install matplotlib
```
+To generate the HTML version of this README (to see the equations):
+```
+pandoc README.md --mathml -s -o README.html
+```
+
## Usage
### Train the model
@@ -51,4 +57,22 @@ The model fits a linear function:
estimatePrice(mileage) = θ0 + θ1 * mileage
```
-Parameters are found via gradient descent with min-max normalization on the input data. After training, thetas are denormalized so they work directly on raw mileage values.
+Parameters are found via gradient descent. The input data is normalized before training using min-max normalization, and the resulting thetas are denormalized afterward so they work directly on raw mileage values.
+
+### Why normalization?
+
+The two variables have very different scales: mileage ranges from ~22,000 to ~240,000 while prices range from ~3,600 to ~8,300. This causes the gradient for $\theta_1$ (which is multiplied by mileage) to be orders of magnitude larger than the gradient for $\theta_0$. No single learning rate can work well for both parameters simultaneously.
+
+Min-max normalization scales each variable to $[0, 1]$:
+
+$$x_{\text{norm}} = \frac{x - x_{\min}}{x_{\max} - x_{\min}}$$
+
+Without normalization, if you pick a learning rate small enough to prevent $\theta_1$ from overshooting, $\theta_0$ barely moves and needs millions of iterations. If you pick a larger learning rate so $\theta_0$ converges in a reasonable time, $\theta_1$ overshoots, oscillates, and diverges to infinity (NaN).
+
+Normalization brings both gradients to the same scale, allowing gradient descent to converge efficiently with a single learning rate.
+
+After training on normalized data, the thetas are converted back to work on raw values:
+
+$$\theta_1' = \theta_1 \cdot \frac{p_{\max} - p_{\min}}{km_{\max} - km_{\min}}$$
+
+$$\theta_0' = \theta_0 \cdot (p_{\max} - p_{\min}) + p_{\min} - \theta_1' \cdot km_{\min}$$
diff --git a/train.py b/train.py
index a9c865b..8b15547 100644
--- a/train.py
+++ b/train.py
@@ -8,7 +8,7 @@ THETAS_FILE = "thetas.csv"
def normalize(data):
min_val = min(data)
max_val = max(data)
- return [(x - min_val) / (max_val - min_val) for x in data], min_val, max_val
+ return [(x - min_val) / (max_val - min_val) for x in data]
def load_data():
@@ -28,7 +28,7 @@ def estimate_price(mileage, theta0, theta1):
# DV: dependent variable, IV: independent variable
-def train_once(learning_rate, DV, IV, theta0, theta1):
+def compute_gradients(learning_rate, DV, IV, theta0, theta1):
tmp0 = (
learning_rate
* (1.0 / len(DV))
@@ -52,12 +52,14 @@ def denormalize_thetas(t0, t1, km_min, km_max, price_min, price_max):
def train(learning_rate, iterations):
kms, prices = load_data()
- kms_norm, km_min, km_max = normalize(kms)
- prices_norm, price_min, price_max = normalize(prices)
+ km_min, km_max = min(kms), max(kms)
+ price_min, price_max = min(prices), max(prices)
+ kms_norm = normalize(kms)
+ prices_norm = normalize(prices)
t0 = 0.0
t1 = 0.0
for _ in range(iterations):
- grad0, grad1 = train_once(learning_rate, prices_norm, kms_norm, t0, t1)
+ grad0, grad1 = compute_gradients(learning_rate, prices_norm, kms_norm, t0, t1)
t0 -= grad0
t1 -= grad1
return denormalize_thetas(t0, t1, km_min, km_max, price_min, price_max)