# train.py

import csv
import sys

DATASET = "data.csv"
THETAS_FILE = "thetas.csv"


def normalize(data):
    """Min-max scale *data* into the range [0, 1].

    Args:
        data: a non-empty sequence of numbers.

    Returns:
        A tuple ``(scaled, min_val, max_val)`` where ``scaled`` is the list
        of rescaled values and ``min_val`` / ``max_val`` are the extremes of
        the input, needed later to undo the scaling.

    Raises:
        ValueError: if *data* is empty.
    """
    if not data:
        raise ValueError("cannot normalize an empty dataset")
    min_val = min(data)
    max_val = max(data)
    span = max_val - min_val
    if span == 0:
        # All values are identical: there is no spread to scale by, so map
        # everything to 0.0 instead of dividing by zero.
        return [0.0 for _ in data], min_val, max_val
    return [(x - min_val) / span for x in data], min_val, max_val


def load_data():
    """Read the dataset file and return its two numeric columns.

    The first row of ``DATASET`` is skipped (it is assumed to be a header).
    For every remaining non-blank row, column 0 is parsed as a mileage and
    column 1 as a price.

    Returns:
        A tuple ``(km, price)`` of two lists of floats, in file order.

    Raises:
        OSError: if ``DATASET`` cannot be opened.
        ValueError: if a cell cannot be parsed as a float.
        IndexError: if a non-blank row has fewer than two columns.
    """
    km = []
    price = []
    # newline="" lets the csv module do its own line-ending handling, as the
    # csv documentation requires for file objects.
    with open(DATASET, newline="") as f:
        reader = csv.reader(f)
        # Skip the header row; the default avoids StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields an empty list for blank lines
                # (e.g. a trailing newline at the end of the file).
                continue
            km.append(float(row[0]))
            price.append(float(row[1]))
    return km, price


def estimate_price(mileage, theta0, theta1):
    """Return the linear prediction ``theta0 + theta1 * mileage``.

    ``theta0`` is the intercept and ``theta1`` the slope of the line.
    """
    slope_term = theta1 * mileage
    return theta0 + slope_term


# DV: dependent variable, IV: independent variable
def train_once(learning_rate, DV, IV, theta0, theta1):
    """Compute one gradient-descent step for the model DV ~ theta0 + theta1 * IV.

    Args:
        learning_rate: step-size multiplier applied to both gradients.
        DV: observed values of the dependent variable (the targets).
        IV: observed values of the independent variable (the model inputs),
            paired element-wise with *DV*.
        theta0: current intercept.
        theta1: current slope.

    Returns:
        A tuple ``(tmp0, tmp1)``: the learning-rate-scaled mean prediction
        error, and the learning-rate-scaled mean of error times input. The
        caller subtracts these from ``theta0`` and ``theta1`` respectively.

    Raises:
        ValueError: if *DV* and *IV* are empty or differ in length.
    """
    m = len(DV)
    if m == 0 or m != len(IV):
        raise ValueError("DV and IV must be non-empty and of equal length")
    # The model predicts the dependent variable from the independent one, so
    # inputs (x) come from IV and targets (y) from DV. Pairing them the other
    # way round would fit the inverse relationship.
    errors = [estimate_price(x, theta0, theta1) - y for x, y in zip(IV, DV)]
    scale = learning_rate * (1.0 / m)
    tmp0 = scale * sum(errors)
    tmp1 = scale * sum(err * x for err, x in zip(errors, IV))
    return tmp0, tmp1


def denormalize_thetas(t0, t1, km_min, km_max, price_min, price_max):
    """Convert thetas fitted on min-max scaled data back to original units.

    Given a line ``price_norm = t0 + t1 * km_norm`` in scaled space, return
    ``(real_t0, real_t1)`` such that ``price = real_t0 + real_t1 * km`` in
    the original units, using the min/max values that defined the scaling.
    """
    km_span = km_max - km_min
    price_span = price_max - price_min
    # The slope is rescaled by the ratio of the two ranges.
    slope = (t1 * price_span) / km_span
    # The intercept is shifted back by the price offset, then corrected for
    # the mileage offset.
    intercept = (t0 * price_span + price_min) - slope * km_min
    return intercept, slope


def train(learning_rate, iterations):
    """Fit theta0 and theta1 on the dataset and return them in original units.

    Loads the data, min-max normalizes both columns, starts both thetas at
    0.0 and applies ``iterations`` update steps from ``train_once`` (called
    with the normalized prices as its DV argument and the normalized mileages
    as its IV argument). The resulting thetas are converted back to the
    original units before being returned as ``(theta0, theta1)``.
    """
    mileages, costs = load_data()
    scaled_km, km_lo, km_hi = normalize(mileages)
    scaled_cost, cost_lo, cost_hi = normalize(costs)

    theta0 = theta1 = 0.0
    for _ in range(iterations):
        step0, step1 = train_once(
            learning_rate, scaled_cost, scaled_km, theta0, theta1
        )
        # Both steps were computed from the same (old) thetas, so update them together.
        theta0, theta1 = theta0 - step0, theta1 - step1

    return denormalize_thetas(theta0, theta1, km_lo, km_hi, cost_lo, cost_hi)


def save_thetas(theta0, theta1):
    """Write theta0 and theta1 to ``THETAS_FILE``, one value per line.

    Any existing file content is overwritten.
    """
    with open(THETAS_FILE, "w") as out:
        out.writelines([f"{theta0}\n", f"{theta1}\n"])


def main(argv=None):
    """Command-line entry point: train the model and save the thetas.

    Args:
        argv: argument list in ``sys.argv`` layout (program name first,
            then the learning rate and the iteration count). Defaults to
            ``sys.argv``. Arguments beyond the first two are ignored.

    Exits with an error message instead of a traceback when the arguments
    are missing or cannot be parsed as a float and an int.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 3:
        sys.exit("usage: python train.py <learning_rate> <iterations>")
    try:
        learning_rate = float(args[1])
        iterations = int(args[2])
    except ValueError:
        sys.exit("error: <learning_rate> must be a float and <iterations> an integer")

    t0, t1 = train(learning_rate, iterations)
    save_thetas(t0, t1)
    print(f"θ0={t0}, θ1={t1} saved to {THETAS_FILE}")


# Run training only when executed as a script, so that importing this module
# (e.g. to reuse estimate_price) has no side effects.
if __name__ == "__main__":
    main()