diff options
-rw-r--r--  README.md | 26
-rw-r--r--  train.py  | 12
2 files changed, 32 insertions(+), 6 deletions(-)
@@ -6,11 +6,17 @@ A simple linear regression implementation using gradient descent to predict car - Python 3 - matplotlib (for visualization only) +- pandoc (to generate HTML documentation) ``` pip install matplotlib ``` +To generate the HTML version of this README (to see the equations): +``` +pandoc README.md --mathml -s -o README.html +``` + ## Usage ### Train the model @@ -51,4 +57,22 @@ The model fits a linear function: estimatePrice(mileage) = θ0 + θ1 * mileage ``` -Parameters are found via gradient descent with min-max normalization on the input data. After training, thetas are denormalized so they work directly on raw mileage values. +Parameters are found via gradient descent. The input data is normalized before training using min-max normalization, and the resulting thetas are denormalized afterward so they work directly on raw mileage values. + +### Why normalization? + +The two variables have very different scales: mileage ranges from ~22,000 to ~240,000 while prices range from ~3,600 to ~8,300. This causes the gradient for $\theta_1$ (which is multiplied by mileage) to be orders of magnitude larger than the gradient for $\theta_0$. No single learning rate can work well for both parameters simultaneously. + +Min-max normalization scales each variable to $[0, 1]$: + +$$x_{\text{norm}} = \frac{x - x_{\min}}{x_{\max} - x_{\min}}$$ + +Without normalization, if you pick a learning rate small enough to prevent $\theta_1$ from overshooting, $\theta_0$ barely moves and needs millions of iterations. If you pick a larger learning rate so $\theta_0$ converges in a reasonable time, $\theta_1$ overshoots, oscillates, and diverges to infinity (NaN). + +Normalization brings both gradients to the same scale, allowing gradient descent to converge efficiently with a single learning rate. 
+ +After training on normalized data, the thetas are converted back to work on raw values: + +$$\theta_1' = \theta_1 \cdot \frac{p_{\max} - p_{\min}}{km_{\max} - km_{\min}}$$ + +$$\theta_0' = \theta_0 \cdot (p_{\max} - p_{\min}) + p_{\min} - \theta_1' \cdot km_{\min}$$ @@ -8,7 +8,7 @@ THETAS_FILE = "thetas.csv" def normalize(data): min_val = min(data) max_val = max(data) - return [(x - min_val) / (max_val - min_val) for x in data], min_val, max_val + return [(x - min_val) / (max_val - min_val) for x in data] def load_data(): @@ -28,7 +28,7 @@ def estimate_price(mileage, theta0, theta1): # DV: dependant variable, IV: independant variable -def train_once(learning_rate, DV, IV, theta0, theta1): +def compute_gradients(learning_rate, DV, IV, theta0, theta1): tmp0 = ( learning_rate * (1.0 / len(DV)) @@ -52,12 +52,14 @@ def denormalize_thetas(t0, t1, km_min, km_max, price_min, price_max): def train(learning_rate, iterations): kms, prices = load_data() - kms_norm, km_min, km_max = normalize(kms) - prices_norm, price_min, price_max = normalize(prices) + km_min, km_max = min(kms), max(kms) + price_min, price_max = min(prices), max(prices) + kms_norm = normalize(kms) + prices_norm = normalize(prices) t0 = 0.0 t1 = 0.0 for _ in range(iterations): - grad0, grad1 = train_once(learning_rate, prices_norm, kms_norm, t0, t1) + grad0, grad1 = compute_gradients(learning_rate, prices_norm, kms_norm, t0, t1) t0 -= grad0 t1 -= grad1 return denormalize_thetas(t0, t1, km_min, km_max, price_min, price_max) |
