from math import exp, log, sqrt, floor, ceil


def get_mean(l):
    """Returns the mean of a list of numbers."""
    return sum(l) / len(l)


def get_min(l):
    """Returns the lowest element of a list of numbers."""
    # Built-in min() replaces the hand-rolled scan.
    return min(l)


def get_max(l):
    """Returns the biggest element of a list of numbers."""
    # Built-in max() replaces the hand-rolled scan.
    return max(l)


def get_variance(l):
    """Returns the (population) variance of a list of numbers."""
    mean = get_mean(l)
    squared_deviations = [(n - mean) ** 2 for n in l]
    return get_mean(squared_deviations)


def get_std(l):
    """Returns the standard deviation of a list of numbers."""
    return sqrt(get_variance(l))


def get_quartiles(l):
    """
    Returns a tuple of the three quartiles (Q1, Q2, Q3) of a list of numbers.

    NOTE(review): this uses a simple position-based scheme; for very short
    lists (fewer than ~4 elements) `ceil(limit)` can index one past the end
    of the sorted list — callers are expected to pass reasonably sized
    samples. Confirm before relying on it for tiny inputs.
    """
    quartile_limits = [len(l) * 0.25, len(l) * 0.5, len(l) * 0.75]
    sorted_l = sorted(l)
    quartiles = []
    for limit in quartile_limits:
        if int(limit) == limit:
            # The quartile position falls exactly on an element.
            q = sorted_l[int(limit)]
        else:
            # Otherwise average the two neighbouring elements.
            q = (sorted_l[floor(limit)] + sorted_l[ceil(limit)]) / 2
        quartiles.append(q)
    return tuple(quartiles)


def get_sigmoid(x):
    """
    Returns the sigmoid of a number.

    Uses the algebraically equivalent form for negative inputs so that exp()
    never receives a large positive argument: the original `exp(-x)` raised
    OverflowError for x <= ~-710, crashing the cost computation whenever a
    linear combination grew strongly negative during training.
    """
    if x >= 0:
        return 1 / (1 + exp(-x))
    e = exp(x)  # exp of a negative number underflows gracefully to 0.0
    return e / (1 + e)


def get_linear_combination(variables, coefficients):
    """
    Returns the linear combination of a set of variables and a set of
    coefficients.

    Parameters:
        variables (list): A list of variables
        coefficients (list): A list of coefficients

    Returns:
        float: The sum of each variable weighted by each coefficient
    """
    # Start value 0.0 keeps the float return type even for empty inputs.
    return sum((v * c for v, c in zip(variables, coefficients)), 0.0)


def get_hypothesis(feature_vector, weight_vector):
    """
    Returns the probability that the output is true given a feature vector
    and its associated weight vector.

    Parameters:
        feature_vector (list): A list of numbers representing the features
        weight_vector (list): A list of numbers of size
            `len(feature_vector) + 1` whose last element is the bias term

    Returns:
        float: The probability that the output is true (between 0 and 1)
    """
    h = get_linear_combination(feature_vector, weight_vector[:-1])
    h += weight_vector[-1]  # Bias term
    return get_sigmoid(h)


def get_cost(feature_vector, weight_vector, actual_label):
    """
    Returns the logistic regression cost for a training example.

    Parameters:
        feature_vector (list): A list of numbers representing the features of
            one training example.
        weight_vector (list): A list of numbers of size
            `len(feature_vector) + 1` whose last element is the bias term.
            Represents the weight of each feature for the hypothesis.
        actual_label (int): The actual label of the training example (1 if
            true or 0 if false).

    Returns:
        float: The log-loss of the prediction for a training example.
    """
    h = get_hypothesis(feature_vector, weight_vector)
    # Clamp the prediction away from exactly 0 and 1 so log() cannot raise
    # a math domain error when the sigmoid saturates in floating point.
    epsilon = 1e-15
    h = min(max(h, epsilon), 1 - epsilon)
    return -actual_label * log(h) - (1 - actual_label) * log(1 - h)


def get_total_cost(feature_matrix, weight_vector, actual_labels):
    """
    Returns the logistic regression cost for a set of training examples.

    Parameters:
        feature_matrix (list): A list of lists of numbers representing the
            features for each training example.
        weight_vector (list): A list of numbers whose last element is the
            bias term. Represents the weight of each feature for the
            hypothesis.
        actual_labels (list): A list of numbers representing the actual
            labels (1 if true or 0 if false), one for each training example.

    Returns:
        float: The mean log-loss of the predictions across all training
            examples.
    """
    total_cost = sum(
        get_cost(feature_vector, weight_vector, actual_label)
        for feature_vector, actual_label in zip(feature_matrix, actual_labels)
    )
    return total_cost / len(feature_matrix)


def get_partial_derivative(hypotheses, actual_labels, column_vector):
    """
    Returns the partial derivative of the cost function for a given parameter.

    Parameters:
        hypotheses (list): A list of numbers representing the vector of
            predictions for all training examples.
        actual_labels (list): A list of numbers representing the actual
            labels (1 if true or 0 if false), one for each training example.
        column_vector (list): A list of numbers representing the value of the
            parameter (feature) for which the partial derivative is to be
            computed, one for each training example.

    Returns:
        float: The partial derivative of the cost function with respect to
            the weight associated with the given parameter.
    """
    result = sum(
        (h - label) * feature
        for h, label, feature in zip(hypotheses, actual_labels, column_vector)
    )
    return result / len(hypotheses)


def gradient_descent(
    feature_matrix, weight_vector, actual_label_vector, learning_rate, iteration_count
):
    """
    Apply the gradient descent algorithm to compute the weight vector that
    minimizes the cost function. Additionally this function prints the cost
    update periodically during the descent's course.

    Parameters:
        feature_matrix (list): A list of lists of numbers representing the
            features for each training example.
        weight_vector (list): A list of numbers of size
            `len(feature_matrix[0]) + 1` whose last element is the bias term.
            Represents the weight of each feature for the hypothesis.
        actual_label_vector (list): A list of numbers representing the actual
            label vector, one for each training example.
        learning_rate (float): The learning rate of the algorithm.
        iteration_count (int): The count of gradient descent updates the
            algorithm will perform.

    Returns:
        (list): The updated weight vector.
    """
    training_example_count = len(feature_matrix)

    # Copy the weight vector in order not to modify the caller's list.
    updated_weight_vector = weight_vector[:]

    # Transpose the feature matrix to get one column vector per feature.
    column_vectors = [list(column) for column in zip(*feature_matrix)]

    # Add a constant-1 column for the bias term's partial derivative.
    # BUG FIX: this column must contain one entry per TRAINING EXAMPLE (it
    # is zipped against the per-example hypotheses), not one per feature as
    # before — otherwise the bias gradient silently used only a truncated
    # slice of the data whenever the example count differed from the
    # feature count.
    column_vectors.append([1] * training_example_count)

    # Print roughly 10 progress updates over the whole run. Integer period
    # avoids the original float modulo, which printed every iteration for
    # iteration_count < 10 (fractional period never divides an int cleanly).
    print_period = max(1, iteration_count // 10)

    # Save the initial cost for printing/comparison during the run.
    old_cost = get_total_cost(feature_matrix, weight_vector, actual_label_vector)
    for i in range(iteration_count):
        # Predictions for every training example under the current weights.
        hypotheses = [
            get_hypothesis(feature_vector, updated_weight_vector)
            for feature_vector in feature_matrix
        ]
        # Partial derivative of the cost for each weight (bias included).
        gradients = [
            get_partial_derivative(hypotheses, actual_label_vector, column_vector)
            for column_vector in column_vectors
        ]
        # Simultaneous update of all the weights.
        updated_weight_vector = [
            w - learning_rate * g for w, g in zip(updated_weight_vector, gradients)
        ]
        # Print the cost update periodically.
        if (i + 1) % print_period == 0:
            new_cost = get_total_cost(
                feature_matrix, updated_weight_vector, actual_label_vector
            )
            print(
                f"Iteration: {i + 1:>10}, cost: {old_cost:>8.4f} -> {new_cost:>8.4f}, "
                f"diff: {abs(new_cost - old_cost):>8.6f}"
            )
            old_cost = new_cost
    return updated_weight_vector