# Submission metadata: author and collaborators of this notebook.
NAME = "Mikoláš Fromm"
COLLABORATORS = "The God"

import IPython
# Stop immediately when the major version of IPython is older than 3.
assert IPython.version_info[0] >= 3, "Your version of Jupyter is too old, please update it."

import numpy as np
from numpy.random import default_rng
from typing import List, Set, Dict, Tuple
from numpy.testing import assert_approx_equal, assert_allclose
rng = default_rng(2023) # common random number generator, seeded with 2023

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs, make_classification, make_moons

class LearningAlgorithm:
    """Base class for learning algorithms.

    Subclasses implement `fit` and `predict`; `score` is shared by all of them.

    Attributes:
        par: Dictionary holding the keyword arguments given to the constructor.
    """

    def __init__(self, **learning_par):
        # The learning parameters are kept as a plain dictionary.
        self.par = learning_par

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Learn the parameters of the algorithm.

        The learned parameters are stored into member variables.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Apply the learned function to the input vectors `X`.

        The returned value is a vector of the results - zeros and ones.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Compute the mean accuracy on the test vectors `X`.

        Args:
            X: Test vectors (the rows of `X`), passed unchanged to `predict`.
            y: Correct labels for the rows of `X`. A column vector of shape
                (n, 1) is accepted as well as a flat vector of shape (n,).

        Returns:
            The fraction of rows whose predicted label equals the label in `y`.

        Raises:
            ValueError: If the number of predictions differs from the number
                of labels.
        """
        # Flatten both sides: comparing an (n,) vector with an (n, 1) column
        # would broadcast to an (n, n) matrix and yield a wrong accuracy.
        predicted = np.asarray(self.predict(X)).ravel()
        expected = np.asarray(y).ravel()
        if predicted.shape != expected.shape:
            raise ValueError(
                f"got {predicted.shape[0]} predictions for {expected.shape[0]} labels"
            )
        return float(np.mean(predicted == expected))

class Memorizer(LearningAlgorithm):
    """Learner that remembers the training set.

    A sample that equals a stored training row gets the label of the first
    such row; any other sample gets a random 0/1 label drawn with the mean of
    the training labels as the probability of 1.
    """

    def __init__(self, rng_seed=42):
        super().__init__(_seed=rng_seed, _rng=default_rng(rng_seed))
        # Filled by `fit` with the pair [training rows, training labels].
        self._learned_par = None

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Store the training rows (as a 2-d array) together with their labels."""
        self._learned_par = [np.atleast_2d(X), y]

    def predict(self, X: np.ndarray) -> np.array:
        """Return a 0/1 label for every row of `X` (a single vector is one row)."""
        # Re-creating the generator from the stored seed makes repeated calls
        # on the same `X` return the same output; for a "serious" application
        # this reset should be omitted.
        self.par['_rng'] = default_rng(self.par['_seed'])
        generator = self.par['_rng']

        queries = np.atleast_2d(X)
        known_rows, known_labels = self._learned_par

        # Start from random labels for all rows ...
        out = generator.binomial(1, known_labels.mean(), queries.shape[0])
        # ... and overwrite those whose row was seen during training.
        for row, query in enumerate(queries):
            hits = np.flatnonzero((known_rows == query).all(axis=1))
            if hits.size > 0:
                out[row] = known_labels[hits[0]]
        return out

from pprint import pprint

# Small training set; the row [0.0, 0.0] occurs twice with different labels.
X_train = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])
y_train = np.array([1, 1, 0, 1, 0])

# Test set: the training rows followed by extra rows built from
# np.arange(0.1, 4.01, 0.1) reshaped into two columns; the extra rows are labelled 0.
X_test = np.vstack((X_train, np.arange(0.1, 4.01, 0.1).reshape(-1, 2)))
y_test = np.hstack((y_train, np.zeros(X_test.shape[0] - y_train.shape[0])))

# Memorizer with the default seed
m = Memorizer()
m.fit(X_train, y_train)

# trained memorizer can predict the label for a sample represented as a list
X_list = [0.0, 1.0]
print(f'The prediction for input list {X_list} is {m.predict(X_list)[0]}')

# trained memorizer can predict the label for a sample represented as a numpy array
X_array = np.array([0.0, 0.0])
print(f'The prediction for input array {X_array} is {m.predict(X_array)[0]}')

# predictions and accuracy on the whole test set
prediction = m.predict(X_test)
print(f'par: {m.par}\nprediction: {prediction} positive predictions: {prediction.sum()}' 
      f'\nscore: {m.score(X_test, y_test)}')

# now we change the seed for the random number generator
m = Memorizer(rng_seed=2023)
m.fit(X_train, y_train)

# same report for the memorizer created with seed 2023
prediction = m.predict(X_test)
print(f'par: {m.par}\nprediction: {prediction} positive predictions: {prediction.sum()}'
      f'\nscore: {m.score(X_test, y_test)}')

The prediction for input list [0.0, 1.0] is 1
The prediction for input array [0. 0.] is 1
par: {'_seed': 42, '_rng': Generator(PCG64) at 0x7F9AA0D42120}
prediction: [1 1 0 1 1 0 0 0 1 1 1 0 0 0 1 1 1 1 0 0 0 1 0 0 0] positive predictions: 12
score: 0.64
par: {'_seed': 2023, '_rng': Generator(PCG64) at 0x7F9AA0D42040}
prediction: [1 1 0 1 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 0 0 1 0 1 1] positive predictions: 16
score: 0.48

class Perceptron(LearningAlgorithm):
    """Single perceptron with a 0/1 threshold output.

    The weight vector holds the input weights followed by the bias as its
    last element. The output for a sample `x` is 1 when `[x, 1] @ weights >= 0`
    and 0 otherwise.

    Attributes:
        weights: Current weight vector (including the bias); empty until it
            is given to the constructor or generated during training.
        lr: Default learning rate.
        max_epochs: Default maximum number of training epochs.
    """

    def __init__(self, init_weights=(), lr=1.0, max_epochs=1000):
        """Create a perceptron.

        Args:
            init_weights: Initial weights (including the bias); may be empty.
            lr: The learning rate.
            max_epochs: The maximum number of epochs.
        """
        # np.array copies its input, so the in-place updates done during
        # training never modify an array owned by the caller.
        weights = np.array(init_weights, dtype=float)
        super().__init__(weights=weights.copy(), lr=lr, max_epochs=max_epochs)
        self.weights = weights
        self.lr = lr
        self.max_epochs = max_epochs

    def _with_bias(self, X) -> np.ndarray:
        """Return `X` as a float array with a trailing column of ones.

        The ones are appended only when the last dimension of `X` differs
        from the number of weights, i.e. input that already has as many
        columns as there are weights is taken to contain the bias input.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim in (1, 2) and X.shape[-1] != self.weights.shape[0]:
            ones = np.ones(X.shape[:-1] + (1,))
            X = np.concatenate((X, ones), axis=-1)
        return X

    @staticmethod
    def _signed_targets(y) -> np.ndarray:
        """Map labels to a flat vector of -1/+1 (positive -> +1, otherwise -1)."""
        return np.where(np.asarray(y).ravel() > 0, 1.0, -1.0)

    def _run_epoch(self, X_bias: np.ndarray, targets: np.ndarray, lr: float) -> None:
        """Do one pass of the perceptron rule over bias-extended samples."""
        for sample, target in zip(X_bias, targets):
            # Use the same threshold as `predict` (output 1 when >= 0), so a
            # sample with linear output exactly 0 and a negative target is
            # recognised as misclassified and corrected.
            predicted = 1.0 if sample @ self.weights >= 0 else -1.0
            if predicted != target:
                self.weights += lr * (sample * target)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Compute the output of the perceptron.

        Args:
            X: One sample (a vector or a list) or a two-dimensional array
                where each row is a sample. Input whose last dimension equals
                the number of weights is used as is (it is taken to include
                the bias input already); otherwise the bias input is appended.

        Returns:
            A vector with values 0/1 with the output of the perceptron for
            all samples in `X`; a single sample yields a one-element vector.

        Raises:
            RuntimeError: If the weights are not initialized.
        """
        if self.weights.size == 0:
            raise RuntimeError("Weights not initialized!")

        linear_output = self._with_bias(X) @ self.weights
        # atleast_1d turns the scalar obtained for a single sample into a vector.
        return np.atleast_1d(linear_output >= 0).astype(int)

    def partial_fit(self, X: np.ndarray, y: np.ndarray, lr=1.0) -> None:
        """Perform one epoch of the perceptron learning algorithm.

        Args:
            X: The training set; samples are rows. The bias input is appended
                unless `X` already has as many columns as there are weights.
            y: The desired outputs 0/1 (targets given as -1/+1 are accepted
                as well).
            lr: The learning rate.

        If `self.weights` is empty, the weight vector is generated randomly.
        The resulting weights are also stored into `self.par['weights']`.
        """
        X_bias = self._with_bias(np.atleast_2d(np.asarray(X, dtype=float)))
        if self.weights.size == 0:
            self.weights = np.random.rand(X_bias.shape[1])
        self._run_epoch(X_bias, self._signed_targets(y), lr)
        self.par['weights'] = self.weights

    def fit(self, X: np.ndarray, y: np.ndarray, lr=None, max_epochs=None) -> int:
        """Train the perceptron using the perceptron learning algorithm.

        Args:
            X: The training set (two-dimensional array of floats, samples
                are rows); a single vector is treated as one sample.
            y: The desired outputs (vector of integers 0/1).
            lr: The learning rate; if `None`, the rate given to the
                constructor is used.
            max_epochs: The maximum number of epochs; if `None`, the limit
                given to the constructor is used.

        If the weight vector is empty, it is generated randomly.

        Returns:
            The number of epochs used in the training (at most `max_epochs`).
        """
        # Explicit None tests: a learning rate or epoch limit of 0 is a
        # legitimate value and must not be replaced by the default.
        lr = self.lr if lr is None else lr
        max_epochs = self.max_epochs if max_epochs is None else max_epochs

        # Training data never contains the bias input, so always append it.
        X = np.atleast_2d(np.asarray(X, dtype=float))
        X = np.concatenate((X, np.ones((X.shape[0], 1))), axis=1)
        targets = self._signed_targets(y)

        if self.weights.size == 0:
            self.weights = np.random.rand(X.shape[1])

        epochs_run = 0
        for _ in range(max_epochs):
            epochs_run += 1
            self._run_epoch(X, targets, lr)
            if self.training_converged(X, targets):
                break

        # Publish the trained weights in the parameter dictionary.
        self.par['weights'] = self.weights
        return epochs_run

    def training_converged(self, X: np.ndarray, t: np.ndarray) -> bool:
        """Check whether every sample in `X` is classified as its target.

        Args:
            X: Samples as accepted by `predict`.
            t: Targets given as -1/+1; any shape holding one value per sample.

        Returns:
            True if the predictions, mapped from 0/1 to -1/+1, equal `t`.
        """
        signed_prediction = self.predict(X) * 2 - 1
        return bool(np.array_equal(signed_prediction, np.asarray(t).ravel()))

# Perceptron with given weights: predicting a single sample given as a list.
p = Perceptron([1,2,3], max_epochs=10)
print(p.predict([1,1]))

# Perceptron without weights: predict is expected to raise an exception.
p = Perceptron(max_epochs=10)
try:
    print(p.predict([1,1]))
except Exception as e:
    # the exception is the expected outcome, so there is nothing to do
    pass
else:
    raise AssertionError("Perceptron.predict with empty weights did not raise an exception")

[1]

# predict on a two-dimensional array: one output per row
p = Perceptron(init_weights=[1,-2,1], max_epochs=10)
assert (p.predict(np.array([[6,3],[5,3],[1,1],[1,1.00001]])) == [1,1,1,0]).all()

# X_list = [0.0, 1.0]
# print(f"Prediction for the input list {X_list} is {p.predict(X_list)[0]}")
# assert (p.predict(X_list)[0] == 0)

# predict on a single sample given as a numpy vector
X_array = np.array([0.0, 0.0])
print(f"Prediction for the input array {X_array} is {p.predict(X_array)[0]}")
assert (p.predict(X_array)[0] == 1)

# training on four points; the asserts pin the expected number of epochs,
# the resulting weights and the accuracy on the training set
p = Perceptron(init_weights=[1,1,1], max_epochs=10)
X_train = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
y_train = np.array([ 1, 0, 1, 0])

epochs = p.fit(X_train, y_train)
print(f"Training required {epochs} epoch(s)")
assert epochs == 2
print(f"Trained perceptron {p.par}")
assert (p.par['weights'] == np.array([ 0., -1.,  0.])).all()
print(f"Score on the training set {p.score(X_train, y_train)}")
assert_approx_equal(p.score(X_train, y_train), 1.0)

# training on a generated two-feature data set with a 70/30 train/test split;
# the perceptron gets learning rate 0.5 and at most 5 epochs
X, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, random_state=1234)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)
p = Perceptron([-1,1,2],0.5,5)
epochs = p.fit(X_train, y_train)
print('number of epochs (should be 5): ', epochs)
assert epochs==5
assert_approx_equal(p.score(X_test, y_test), 0.86666666)
assert_allclose(p.par['weights'], [0.04921529, 1.30631335, 0.        ])

Prediction for the input array [0. 0.] is 1
Training required 2 epoch(s)
Trained perceptron {'weights': array([ 0., -1.,  0.]), 'lr': 1.0, 'max_epochs': 10}
Score on the training set 1.0
number of epochs (should be 5):  5

import copy

def cross_val(k: int, learn_alg1: "LearningAlgorithm", learn_alg2: "LearningAlgorithm",
              X: np.ndarray, y: np.ndarray, shuffle: bool = True, verbose=0) -> Tuple[float, float]:
    '''Estimates the difference between errors and the standard deviation of the difference for
    two learning algorithms.

        delta, std = cross_val(k, learn_alg1, learn_alg2, X, y, shuffle, verbose)

    Both algorithms are trained and tested on exactly the same folds. The
    passed objects are never trained themselves; a fresh deep copy of each
    is trained for every fold.

    Args:
        k:           The number of folds used in k-fold cross-validation.
        learn_alg1:  The first learning algorithm.
        learn_alg2:  The second learning algorithm.
        X:           (2-d numpy array of floats): The training set; samples are rows.
        y:           (vector of integers 0/1): The desired outputs for the training samples.
        shuffle: If True, shuffle the samples into folds; otherwise, do not shuffle
                 the samples before splitting them into folds.
        verbose: If verbose == 0, it prints nothing.
                 If verbose == 1, prints errors of both algorithms for each fold as a tuple.
                 If verbose == 2, for each fold, it prints
                             the parameters of the first trained algorithm,
                             the parameters of the second trained algorithm, and
                             the errors of both algorithms as a list with two elements

    Returns:
        delta: The estimated difference: the error rate of the first algorithm minus
            the error rate of the second algorithm computed using k-fold cross-validation.
        std: The sample standard deviation of the estimated difference of errors.

    Raises:
        ValueError: If `X` and `y` differ in the number of samples, or if `k` is
            smaller than 2 or larger than the number of samples.
    '''
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(f"X has {n_samples} samples but y has {y.shape[0]}")
    # k == 1 would leave no training data (and divide by k - 1 == 0 below);
    # k > n_samples would create empty test folds.
    if not 2 <= k <= n_samples:
        raise ValueError(f"k must be between 2 and the number of samples ({n_samples}), got {k}")

    if shuffle:
        # The same permutation is applied to samples and labels.
        order = np.random.permutation(n_samples)
        X, y = X[order], y[order]

    ## split the data into k folds
    X_folds = np.array_split(X, k)
    y_folds = np.array_split(y, k)

    ## iterate over the folds
    deltas = np.zeros(k)
    for i in range(k):
        ## fold i is the test set, the remaining folds form the training set
        X_test, y_test = X_folds[i], y_folds[i]
        X_train = np.concatenate(X_folds[:i] + X_folds[i + 1:])
        y_train = np.concatenate(y_folds[:i] + y_folds[i + 1:])

        ## train fresh copies so that no state leaks between the folds
        fold_alg1 = copy.deepcopy(learn_alg1)
        fold_alg2 = copy.deepcopy(learn_alg2)
        fold_alg1.fit(X_train, y_train)
        fold_alg2.fit(X_train, y_train)

        ## the error rate is the complement of the accuracy
        error_alg1 = 1 - fold_alg1.score(X_test, y_test)
        error_alg2 = 1 - fold_alg2.score(X_test, y_test)

        deltas[i] = error_alg1 - error_alg2
        if verbose == 1:
            print((error_alg1, error_alg2))
        if verbose == 2:
            print(f"Fold {i}: {fold_alg1.par}, {fold_alg2.par}, {[error_alg1, error_alg2]}")

    delta = np.mean(deltas)

    ## sample standard deviation of the per-fold differences (divisor k - 1)
    std = np.sqrt(np.sum((deltas - delta)**2) / (k - 1))
    return float(delta), float(std)

# 5-fold comparison of two perceptrons that differ only in max_epochs (10 vs 100)
# on the make_moons data; the assert pins the expected difference and deviation
X, y = make_moons(n_samples=200, noise=0.2, random_state=5)

delta, sigma = cross_val(5, Perceptron(init_weights=[1,-2,1], max_epochs=10), 
             Perceptron(init_weights=[1,-2,1], max_epochs=100), X, y, shuffle=False, verbose=1)
print(f"Error difference: {delta} std: {sigma}")
assert_allclose((delta, sigma), (0.015, 0.03354101967))


# the same comparison with 10 folds on the make_blobs data
X, y = make_blobs(n_samples=600, centers=2, n_features=2, random_state=4)

delta, sigma = cross_val(10, Perceptron(init_weights=[1,1,1], max_epochs=10), 
             Perceptron(init_weights=[1,1,1], max_epochs=100), X, y, shuffle=False, verbose=1)
print(f"Error difference: {delta} std: {sigma}")
assert_allclose((delta, sigma), (-0.0133333333, 0.045677344))

(0.15000000000000002, 0.09999999999999998)
(0.22499999999999998, 0.17500000000000004)
(0.09999999999999998, 0.125)
(0.125, 0.125)
(0.275, 0.275)
Error difference: 0.01499999999999999 std: 0.03354101966249685
(0.09999999999999998, 0.15000000000000002)
(0.050000000000000044, 0.06666666666666665)
(0.15000000000000002, 0.09999999999999998)
(0.033333333333333326, 0.01666666666666672)
(0.09999999999999998, 0.1333333333333333)
(0.08333333333333337, 0.050000000000000044)
(0.033333333333333326, 0.033333333333333326)
(0.050000000000000044, 0.15000000000000002)
(0.050000000000000044, 0.033333333333333326)
(0.06666666666666665, 0.1166666666666667)
Error difference: -0.01333333333333333 std: 0.04567734398020993

from scipy.stats import t

def conf_interval(delta: float, sigma: float, conf_level: float, k: int) -> Tuple[float,float]:
    """Compute the confidence interval for the estimated difference of errors
    `delta` with standard deviation `sigma` returned from cross_val().

        low, high = conf_interval(delta, sigma, conf_level, k)

    The interval is `delta -/+ q * sigma / sqrt(k)`, where `q` is the
    `1 - (1 - conf_level) / 2` quantile of Student's t-distribution with
    `k - 1` degrees of freedom.

    Args:
        delta: The difference of errors computed by k-fold cross-validation.
        sigma: The standard deviation of the difference of errors computed
            by k-fold cross-validation.
        conf_level: The confidence level. A value strictly between 0 and 1.
        k: The number of folds used in k-fold cross-validation (at least 2).

    Returns:
        The lower and the upper bound of the confidence interval.

    Raises:
        ValueError: If `k` is smaller than 2 or `conf_level` is not strictly
            between 0 and 1.
    """
    if k < 2:
        raise ValueError(f"k must be at least 2, got {k}")
    if not 0.0 < conf_level < 1.0:
        raise ValueError(f"conf_level must be strictly between 0 and 1, got {conf_level}")

    ## two-sided interval: e.g. for conf_level = 0.9 the quantile is
    ## t.ppf(1 - (1 - 0.9) / 2, k - 1) = t.ppf(0.95, k - 1)
    quantile = t.ppf(1 - (1 - conf_level) / 2, k - 1)
    half_width = quantile * sigma / np.sqrt(k)
    return float(delta - half_width), float(delta + half_width)

# reference values for conf_interval
assert_allclose(conf_interval(0.1, 0.03, 0.9, 10), (0.08260956,0.11739044))
assert_allclose(conf_interval(-0.1, 0.03, 0.95, 7), (-0.12774537,-0.07225463))

# load both space-delimited data files as float arrays and show
# the shape and the first row of each
dataset1 = np.genfromtxt('Data1.txt', delimiter=' ', dtype = float)
print(f"{dataset1.shape=}")
print(dataset1[0])
dataset2 = np.genfromtxt('Data2.txt', delimiter=' ', dtype = float)
print(f"{dataset2.shape=}")
print(dataset2[0])

dataset1.shape=(300, 3)
[0.66814243 1.11387772 0.        ]
dataset2.shape=(600, 6)
[ 0.96478168 -2.06722166  0.71740289  3.22099464  2.9890881   0.        ]

# Compare a perceptron with a memorizer by 5-fold cross-validation on dataset1;
# the last column holds the labels, the remaining columns are the features.
alg1 = Perceptron(init_weights=[1, 1, -1],lr=1.0, max_epochs=20)
alg2 = Memorizer(rng_seed=2023)
delta, sigma = cross_val(5, alg1, alg2, dataset1[:, :-1], dataset1[:, -1], shuffle=False, verbose=1)
# 95% confidence interval for the difference of errors (perceptron minus memorizer)
low, high = conf_interval(delta, sigma, 0.95, 5)

(0.19999999999999996, 0.4833333333333333)
(0.06666666666666665, 0.5666666666666667)
(0.1166666666666667, 0.33333333333333337)
(0.1333333333333333, 0.4833333333333333)
(0.09999999999999998, 0.43333333333333335)

# do not modify this cell
# prints the current values of `low` and `high` in the form name=value
print(f"{low=} {high=}")

low=-0.46705780957062737 high=-0.20627552376270605

# YOUR CODE HERE
# Compare two perceptrons that differ only in max_epochs (10 vs 100) by
# 6-fold cross-validation on dataset2; the last column holds the labels.
alg1 = Perceptron(init_weights=[1, -1, 1, -1, 1, -1],lr=1.0, max_epochs=10)
alg2 = Perceptron(init_weights=[1, -1, 1, -1, 1, -1],lr=1.0, max_epochs=100)
delta, sigma = cross_val(6, alg1, alg2, dataset2[:, :-1], dataset2[:, -1], shuffle=False, verbose=0)
# 99% confidence interval for the difference of errors
low, high = conf_interval(delta, sigma, 0.99, 6)
print(f"{low=} {high=}")

low=-0.06485249932964555 high=0.08151916599631225

Param	Meaning
`learning_par`	is a dictionary of parameters of the learning algorithm,
`fit`	is the learning function; it computes learned parameters and stores them into member variables; it can use the parameters from the member variable `self.par`,
`X`	is a two-dimensional numpy array (training and test vectors are the rows of `X`),
`y`	is a vector (a one-dimensional numpy array) of desired labels (zeros and ones), and
`predict`	computes the learned function for the input vectors from a two-dimensional numpy array `X` (each row of the array is an input vector); it uses the learned parameters and/or the parameters from the member variable `self.par`; the returned value is a vector of the results - zeros and ones,
`score`	computes the mean accuracy of the trained algorithm on the test vectors `X`, where `y[i]` contains the correct label for the vector (the row of `X`) `X[i]`.

Assignment: $k$-Fold Cross-Validation

A simple learning algorithm `Memorizer`

Task 1: Implement perceptron (2 points)

Task 2: Implement k-fold cross-validation (4 points)

Task 3: Implement computing of confidence interval (1 point)

Task 4: Compare learning algorithms using $k$-fold cross-validation (3 points)

Compare Perceptron with Memorizer (1.5 points)

Compare two perceptrons (1.5 points)

Assignment: $k$-Fold Cross-Validation

A simple learning algorithm `Memorizer`

Task 1: Implement perceptron (2 points)

Task 2: Implement k-fold cross-validation (4 points)

Task 3: Implement computing of confidence interval (1 point)

Task 4: Compare learning algorithms using $k$-fold cross-validation (3 points)

Compare Perceptron with Memorizer (1.5 points)

Compare two perceptrons (1.5 points)

A simple learning algorithm `Memorizer`