### Original code by Harshit Agrawal @ GeeksForGeeks
def TowerOfHanoi(n , from_rod, to_rod, aux_rod, counter = 0):
    # The code is interestingly reminiscent of the proof that ToH can be completed in 2^n - 1 moves. That
    # is, we can see that there are two lines of code (i.e. the two remaining rods) where the n - 1 discs have to 
    # switch around (the recursion) to let the final disk move from rod 1 to rod 3. 
    if n == 0:
        return
    TowerOfHanoi(n-1, from_rod, aux_rod, to_rod)
    print("Move disk",n,"from rod",from_rod,"to rod",to_rod)
    TowerOfHanoi(n-1, aux_rod, to_rod, from_rod)

n = 3
TowerOfHanoi(n, 'A', 'C', 'B')
# A, C, B are the name of rods

Move disk 1 from rod A to rod C
Move disk 2 from rod A to rod B
Move disk 1 from rod C to rod B
Move disk 3 from rod A to rod C
Move disk 1 from rod B to rod A
Move disk 2 from rod B to rod C
Move disk 1 from rod A to rod C

import time
import numpy as np
import h5py
import matplotlib.pyplot as plt
import scipy
from PIL import Image
from scipy import ndimage
import torch                     # you need to have installed pytorch via pip (check https://www.geeksforgeeks.org/install-pytorch-on-windows/) or 
                                 # I prefer installing anaconda and then running 'conda install -c pytorch pytorch'
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# For CNN
from torch.nn import Module
from torch.nn import Conv2d
from torch.nn import Linear
from torch.nn import MaxPool2d
from torch import flatten

%matplotlib inline
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2


### 1. Load the data
# Training instances
train_dataset = h5py.File('train_catvnoncat.h5', "r")
train_x_orig = np.array(train_dataset["train_set_x"][:]) # your train set features
train_y = np.array(train_dataset["train_set_y"][:]) # your train set labels
train_y = train_y.reshape((1, train_y.shape[0]))
# Testing instances
test_dataset = h5py.File('test_catvnoncat.h5', "r")
test_x_orig = np.array(test_dataset["test_set_x"][:]) # your test set features
test_y = np.array(test_dataset["test_set_y"][:]) # your test set labels
test_y = test_y.reshape((1, test_y.shape[0]))
# Class labels for visualization
classes = np.array(test_dataset["list_classes"][:]) # the list of classes

# Data preprocessing so that PyTorch can work with numpy arrays

tensor_train_x = torch.from_numpy(train_x_orig)
tensor_train_y = torch.from_numpy(train_y)
tensor_test_x = torch.from_numpy(test_x_orig)
tensor_test_y = torch.from_numpy(test_y)

tensor_train_x_flatten = torch.from_numpy(train_x_orig.reshape(train_x_orig.shape[0], -1)/255)
tensor_train_y_flatten = torch.from_numpy(train_y.reshape(train_y.shape[0], -1).T)
tensor_test_x_flatten = torch.from_numpy(test_x_orig.reshape(test_x_orig.shape[0], -1)/255)
tensor_test_y_flatten = torch.from_numpy(test_y.reshape(test_y.shape[0], -1).T)

# Data visualization
image_idx = 5        # You can choose your own number between 0 - 208
plt.imshow(tensor_train_x[image_idx,])
print("The label is {0}".format(classes[tensor_train_y_flatten[image_idx].item()]))


### 2. Model definition
class ConvNet(nn.Module):
    '''
    Simple Convolutional Neural Network for cat image recognition.
    It's quiet unorthodox to use a CNN for binary classification,
    but we'll see that it serves to defend the idea that image classification
    is better tackled using an AI strategy. 
    The code is adapted from: https://coderzcolumn.com/tutorials/artifical-intelligence/pytorch-convolutional-neural-networks
    '''
    def __init__(self):
        super(ConvNet, self).__init__()
        self.seq = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=24, kernel_size=5, padding = 1, stride = 1),
            # the ReLu activation function led to gradient vanishing. I guess it's due to 
            # dataset (209 training instances) being too small.  
            nn.Tanh(),
            # pooling is for down-scaling values which ensure robustness to orientation and scale variance
            nn.MaxPool2d(kernel_size = 2, stride = 1, padding = 1), 
            # Dropout is to prevent overfitting. Earlier versions of the code achieved 100% training
            # accuracy with ~70% validation accuracy
            nn.Dropout(), 
            
            nn.Conv2d(in_channels=24, out_channels=12, kernel_size=3, padding = 1, stride = 1),
            nn.Tanh(),
            nn.MaxPool2d(kernel_size = 2, stride = 1, padding = 1),
            nn.Dropout(), 
            
            nn.Conv2d(in_channels=12, out_channels=6, kernel_size=3, padding = 1, stride = 1),
            nn.Tanh(),
            nn.MaxPool2d(kernel_size = 2, stride = 1, padding = 1),
            nn.Dropout(),
            
            nn.Flatten(),
            # the last layer contains 25350 perceptrons. The number emerges as follows
            # There are originally (64 + 3)^2 * 6 = 26934 perceptrons, but 
            # 26934 - 1584 randomly dropped out perceptrons
            nn.Linear(in_features = 25350, out_features = 1),  
            # The output will be passed through the Sigmoid function because it's 
            # binary classification. 
            nn.Sigmoid()         
        )
    def forward(self, x_batch):
        preds = self.seq(x_batch)
        return preds

The label is b'non-cat'

### 3. Model initialization and hyperparameter choice
torch.manual_seed(1)                                        # for reproducibility
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'   # speeds things up if you have a GPU
#device = 'cpu'
conv_net = ConvNet()
conv_net.to(device)
loss_criterion = nn.BCELoss()                                # loss criterion for binary classification
optimizer = torch.optim.Adam(conv_net.parameters(), lr=0.001)# Adam optimizer with arbitrary low learning rate.
                                                             # A higher lr could lead to gradient vanishing. 
# We'll perform mini-batch gradient descent
batch_size = 24                                              # arbitrary, you can modify it
epochs = 100                                              # arbitrary, you can modify it
# There is an additional preprocessing step to ensure the number of channels is coherent
# with the model definition relevant for training and testing. 
tensor_train_x_cnn = tensor_train_x.permute(0, 3, 1, 2).float()
tensor_test_x_cnn = tensor_test_x.permute(0, 3, 1, 2).float()


### 4. Training model
for _ in range(epochs): 
    batches = torch.arange((tensor_train_x_cnn.shape[0]//batch_size)+1) ### Batch Indices
    # Mini-batch gradient descent
    for batch in batches:
        if batch != batches[-1]:
            start, end = int(batch*batch_size), int(batch*batch_size+batch_size)
        else:
            start, end = int(batch*batch_size), None
        X_batch, Y_batch = tensor_train_x_cnn[start:end], tensor_train_y_flatten[start:end] ## Single batch of data
        X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
        # Forward pass
        preds = conv_net(X_batch) 
        # Compute loss
        loss = loss_criterion(preds, Y_batch.float()) ## Calculate Loss
        # Optimize
        optimizer.zero_grad() ## Zero weights before calculating gradients
        loss.backward()       ## Calculate Gradients
        optimizer.step()      ## Update Weights
print("The model was able to minimize the error up to: ", loss)

The model was able to minimize the error up to:  tensor(0.0032, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)

torch.manual_seed(1) 
### 5. Model evaluation
train_output = conv_net(tensor_train_x_cnn.to(device)) 
train_accuracy = sum((train_output > 0.5) == tensor_train_y_flatten.to(device)\
                    )/tensor_train_y_flatten.shape[0]
test_output = conv_net(tensor_test_x_cnn.to(device))
test_accuracy = sum((test_output > 0.5) == tensor_test_y_flatten.to(device)\
                   )/tensor_test_y_flatten.shape[0]
print("The model can achieve a {0}% accuracy in the training set, while a {1}% accuracy in the testing set \n"
      .format(round(train_accuracy.item()*100,2), round(test_accuracy.item()*100, 2)))

### 6. We can check which images were misclassified for fun :)
test_output[((test_output.to(device) > 0.5) == tensor_test_y_flatten.to(device)) == False]
idx = 1
plt.imshow(tensor_test_x[idx, ])
print("This image was classified as {}, when the correct label was {}"
      .format(test_output[idx].item() > 0.5, tensor_test_y_flatten[idx].item() == 1) )

The model can achieve a 99.52% accuracy in the training set, while a 62.0% accuracy in the testing set 

This image was classified as True, when the correct label was True

for name, parameter in conv_net.named_parameters():
    
    #print("weight" in name)
    print(name)
    if parameter.requires_grad: 
        print("yes")
        print(torch.abs(parameter.grad.mean()))

seq.0.weight
yes
tensor(0.0018, device='cuda:0')
seq.0.bias
yes
tensor(1.0082e-05, device='cuda:0')
seq.4.weight
yes
tensor(0.0003, device='cuda:0')
seq.4.bias
yes
tensor(0.0013, device='cuda:0')
seq.8.weight
yes
tensor(0.0001, device='cuda:0')
seq.8.bias
yes
tensor(0.0008, device='cuda:0')
seq.13.weight
yes
tensor(0.0004, device='cuda:0')
seq.13.bias
yes
tensor(0.0026, device='cuda:0')

from sklearn.linear_model import LogisticRegression

### We will work with the original numpy arrays without converting them to tensors this time
### But we'd still flatten them and normalize them given that log.reg models don't accent
### 4D input
train_x_orig_lr = train_x_orig.reshape(train_x_orig.shape[0], -1)/255    
train_y_lr = train_y.reshape(train_y.shape[0], -1).T
test_x_lr = test_x_orig.reshape(test_x_orig.shape[0], -1)/255
test_y_lr = test_y.reshape(test_y.shape[0], -1).T

lr_cat = LogisticRegression()
lr_cat.fit(train_x_orig_lr, train_y_lr)
training_acc_lr = lr_cat.score(train_x_orig_lr, train_y_lr)
test_acc_lr = lr_cat.score(test_x_lr, test_y_lr)
print("The model can achieve a {0}% accuracy in the training set, while a {1}% accuracy in the testing set \n"
      .format(round(training_acc_lr*100,2), round(test_acc_lr*100, 2)))
print("The model parameters are {0}, where we can pick a random parameter and say that a unit increase \
of a feature pixel is the expected unit change in log odds of the outcome."
      .format(lr_cat.coef_))

C:\Users\86136\anaconda3\lib\site-packages\sklearn\utils\validation.py:72: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  return f(**kwargs)

The model can achieve a 100.0% accuracy in the training set, while a 72.0% accuracy in the testing set 

The model parameters are [[ 0.01848942 -0.04769157 -0.01966443 ... -0.02367532 -0.04662887
   0.05457665]], where we can pick a random parameter and say that a unit increase of a feature pixel is the expected unit change in log odds of the outcome.

C:\Users\86136\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, log_loss
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### 1. Data loading and descriptive statistics
cad_orig = pd.read_csv("Coronary_artery.csv")
cad_data = pd.read_csv("data.csv") # The same as "cad_orig", but with the factors converted from
                                   # strings to numbers. E.g. male -> 0, female -> 1
cad_data.drop(cad_data.columns[[0]], axis=1, inplace=True) # drop the first column since it consists only of
                                                           # each participant's ID number
### 1.1 Data cleaning
### There exists participants whose measurements of ca and thal are marked as '?', 
### so they'll be removed to avoid some errors below in running the models
### We're ultimately left with 297 participants
index_ca = cad_data[cad_data['ca'] == '?'].index
cad_data.drop(index_ca, inplace=True)    
index_thal = cad_data[cad_data['thal'] == '?'].index
cad_data.drop(index_thal, inplace=True)  
cad_data.dropna(inplace = True)
print("A sample of the dataset shows")
cad_data.head(5)

A sample of the dataset shows

print("Descriptive statistics show:")
cad_data.describe()

Descriptive statistics show:

correlation_mat = cad_data.corr()
plt.figure(figsize=(20,20))
ax=sns.heatmap(correlation_mat, annot = True)
plt.show(ax)

# sns.pairplot(cad_data, hue="class", diag_kind="hist")

m = nn.Softmax()
input_m = torch.randn(2, 3)
output = m(input_m)
output

<ipython-input-9-a3e6e306f5b0>:3: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
  output = m(input_m)

tensor([[0.2103, 0.5198, 0.2699],
        [0.2325, 0.2464, 0.5210]])

### 1.2 Splitting data and conversion to tensor
X_train, X_test, y_train, y_test = train_test_split(cad_data.drop("class", axis=1), cad_data["class"],
                                                    train_size=0.7, test_size=0.3, random_state=0)
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = \
torch.from_numpy(np.vstack(X_train.values).astype(np.float)),\
torch.from_numpy(np.vstack(X_test.values).astype(np.float)),\
torch.from_numpy(np.vstack(y_train.values).astype(np.float)), \
torch.from_numpy(np.vstack(y_test.values).astype(np.float))

y_train_tensor = torch.flatten(y_train_tensor.long())
y_test_tensor = torch.flatten(y_test_tensor.long())

### 2. Building the PyTorch Deep Learning model
torch.manual_seed(1)
class Net(nn.Module):
    '''
    We will emulate Coursera's deep learning model using PyTorch. The original 
    code is attached below in the Appendix Section. Please don't look if you haven't
    completed the specialization.
    In the DL course, we were asked to implement a 5-layers model 
    with the following structure = input -> relu -> relu -> relu -> sigmoid (output)
    '''
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(13, 50)  # The input size must match the amount of features. 
        self.fc2 = nn.Linear(50, 25)  # The number of perceptrons was chosen arbitrarily 
        self.fc3 = nn.Linear(25, 10)  # to make it look complex.
        self.fc4 = nn.Linear(10, 5)    # Last layer must have 5 outputs for the 5 classes to be 
                                      # be predicted by softmax, where
                                      # the max prob would denote the network's prediction

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return F.log_softmax(x, dim = 1)
### 3. Initializing the model and defining hyperparameters
my_net = Net()
loss_criterion_dl = nn.CrossEntropyLoss()  # for multi-class classification
optimizer_dl = torch.optim.Adam(my_net.parameters(), lr=0.001)
### Running the deep learning model for cat recognition
iterations = 1000                                    
for epoch in range(iterations): 
    my_net.zero_grad()  
    # Forward pass
    output = my_net(X_train_tensor.float())  
    loss = loss_criterion_dl(output, y_train_tensor)  
    # Stochastic gradient descent for weight adjustments
    loss.backward()  
    optimizer_dl.step()  
print("The model was able to minimize the error up to: ", loss) 

### Model evaluation
train_output = my_net(X_train_tensor.float())  
train_accuracy = sum(nn.Softmax(1)(train_output).argmax(1) == y_train_tensor)/y_train_tensor.shape[0]
train_accuracy = train_accuracy.item()
test_output = my_net(X_test_tensor.float())  
test_accuracy = sum(nn.Softmax(1)(X_test_tensor).argmax(1) == y_test_tensor)/y_test_tensor.shape[0]
test_accuracy = test_accuracy.item()
print("The model can achieve a {0}% accuracy in the training set, while a {1}% accuracy in the testing set \n"
      .format(round(train_accuracy*100,2), round(test_accuracy*100, 2)))
### comment: About ~250 lines of code of deep learning from the Coursera specialization
### was simplified through ~50 lines of DNN using PyTorch. It was fun though.

The model was able to minimize the error up to:  tensor(0.4472, grad_fn=<NllLossBackward>)
The model can achieve a 84.54% accuracy in the training set, while a 5.56% accuracy in the testing set

### Logistic regression model running the CAD dataset
from sklearn.linear_model import LogisticRegression
np.random.seed(1)
### 1. Load data
X_train, X_test, y_train, y_test = train_test_split(cad_data.drop("class", axis=1), cad_data["class"],
                                                   train_size=0.7, test_size=0.3, random_state=0)
### 2. Initialize model 
lr_cad = LogisticRegression(multi_class='multinomial')
### 3. Fit model
lr_cad.fit(X_train, y_train)
### 4. Evaluate model
training_acc_lr = lr_cad.score(X_train, y_train)
test_acc_lr = lr_cad.score(X_test, y_test)
print("The multinomial LogReg model can achieve a {0}% accuracy in the training set, while a {1}% accuracy in the testing set \n"
      .format(round(training_acc_lr*100,2), round(test_acc_lr*100, 2)))
print("There are {1} model parameters, which are are {0}, where we can pick a random parameter and say that a unit increase \
of a feature is the expected unit change in log odds of the outcome."
      .format(lr_cad.coef_, lr_cad.coef_.shape[0] * lr_cad.coef_.shape[1] ))

The multinomial LogReg model can achieve a 60.87% accuracy in the training set, while a 58.89% accuracy in the testing set 

There are 65 model parameters, which are are [[ 2.39301738e-02 -1.03061946e-01 -2.47418316e-01 -2.35785151e-02
  -1.39543167e-03  1.09133566e-02 -4.03659032e-02  5.29101246e-02
  -1.23536211e-01 -2.22637393e-01 -7.71379136e-02 -2.05627430e-01
  -5.61176488e-01]
 [-1.50238146e-02  3.99877695e-02  3.52649069e-02 -3.65654646e-03
   6.80972195e-04 -3.16127153e-02 -1.40256724e-02  8.99341750e-03
   4.33511730e-02 -5.36037256e-02  1.25803942e-02 -2.94317585e-02
   8.64128143e-02]
 [ 3.25608501e-02  3.74101668e-02  8.29935055e-02 -1.05713470e-02
   1.45558810e-03 -2.44598191e-03 -7.69949869e-02 -1.59133508e-02
   4.76870717e-02  1.39895559e-01  2.71667135e-02  3.82662493e-02
   1.75143462e-01]
 [-2.77738643e-02  2.46901574e-02  9.86667184e-02  8.85909984e-03
   9.86808738e-05  3.04585898e-02  8.44704125e-02 -1.05306866e-02
   4.40649888e-02  9.05611643e-02  2.08036631e-02  1.43457832e-01
   2.10378988e-01]
 [-1.36933450e-02  9.73852209e-04  3.04931850e-02  2.89473086e-02
  -8.39809501e-04 -7.31324926e-03  4.69161500e-02 -3.54595046e-02
  -1.15670230e-02  4.57843950e-02  1.65871427e-02  5.33351070e-02
   8.92412243e-02]], where we can pick a random parameter and say that a unit increase of a feature is the expected unit change in log odds of the outcome.

C:\Users\86136\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

import statsmodels.api as sm
X_train, X_test, y_train, y_test = train_test_split(cad_data.drop("class", axis=1), cad_data["class"],
                                                   train_size=0.7, test_size=0.3, random_state=0)
X_train = X_train.values.astype("float64")
y_train = y_train.values.astype("float64")
X_test = X_test.values.astype("float64")
y_test = y_test.values.astype("float64")
mn_logit_model = sm.MNLogit(y_train, X_train)
fit_mn_logit_model = mn_logit_model.fit()
fit_mn_logit_model.summary()

Optimization terminated successfully.
         Current function value: 0.796120
         Iterations 9

### SARSA AGENT THAT PLAYS TOWER OF HANOI. TEMPLATE CODE ADAPTED FROM 
### https://github.com/clarisli/RL-Easy21
import numpy as np
import random
import math 
from enum import Enum
from itertools import permutations
from copy import copy

class Environment:
    '''
    Intuitive environment for Tower of Hanoi in which
    the RL agent will learn to play
    '''
    def __init__(self, discs):
        """
        The 0 rod means there isn't any rod. I added it in order to 
        take into account the possibility of anction popping from an empty list.
        """
        self.rods = {0:[], 1:[], 2:[]}
        self.state = State(self.rods)
        self.discs = discs

    def init_state(self):
        self.state.rods = {0:[], 1:[], 2:[]}
        self.state.rods[0] += [i for i in reversed(range(self.discs))] 
        self.state.is_win = False
        self.state.num_steps = 0
        return self.state

    def moveFrom(self, myRod):
        if len(myRod) == 0:
            return []
        else:
            return [myRod.pop()]

    def step(self, state, action):
        """
        KEY FUNCTION
        Arguments:
        state: gives access to current states of three rods
        action: tuple of size 2 (a, b), where a represents
        the rod from which a disk is moved, b the rod moved to.
        Returns:
        next_state
        reward: depending on whether there was an illegal move, 
        reached the goal, or otherwise made a normal step
        """
        next_state = copy(state)
        next_state.rods[action[1]] += self.moveFrom(next_state.rods[action[0]])
        #next_state.rods[action[1]].append(next_state.rods[action[0]].pop())  # append disc to rod 1 popped from rod 0
        next_state.num_steps += 1  # to check that agent learns to solve TOH in 2^N - 1 steps
        if self.is_illegal(next_state.rods[action[1]]):
            next_state.rods[action[0]].append(state.rods[action[1]].pop())  # reverse the illegal move
            return next_state, Reward.REVERSE
        elif self.is_victory(next_state):
            next_state.is_win = True
            return next_state, Reward.GOAL

        return next_state, Reward.STEP
        
    def is_illegal(self, rod):
        """
        For example, 0: [5, 4, 3, 2, 0, 1] is illegal 
        """
        if (len(rod) == 1) or (len(rod) == 0):
            return False
        elif rod[-1] > rod[-2]:
            return True 
    def is_victory(self, state):
        return state.rods[2] == [i for i in reversed(range(self.discs))]

class Reward (Enum):
    STEP = -1
    GOAL = 10
    REVERSE = -50

class State:
    def __init__(self, rods, is_win = False):
        self.rods = rods
        self.is_win = is_win
        self.num_steps = 0  # to check if agent completed in 2**discs - 1 steps

class Action:
    """
    A field of this class gets all the possible moves that 
    the rod can move from, to.
    
    """
    def __init__(self):
        self.all_actions = list(permutations([0, 1, 2], 2))
        # all actions are [(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
        self.num_actions = len(self.all_actions)
        self.enum = list(enumerate(self.all_actions)) # for filling the Q(S,A) matrix
        
class RLAgent:
    def __init__(self, environment, actions, N0=100, discount_factor=1, _lambda=0.1):
        self.env = environment
        self.actions = actions
        self.N0 = N0
        self.discount_factor = discount_factor
        self._lambda = _lambda
        self.Q = self._init_tenor()
        self.returns_count = self._init_tenor()

    def _init_tenor(self):
        """
        initializing Q(S, A)
        O(6n^3)
        Added + 1 to avoid zero index error or out of bounds
        """
        return np.zeros((self.env.discs + 1, self.env.discs + 1, self.env.discs + 1, self.actions.num_actions))  
        # 2 * 3C2 = 12, 12 possible actions, e.g, move from rod 0 to rod 2 and viceversa

    def _get_alpha(self):
        return 0.5 # for learning rate or stepsize update 

    def _get_epsilon(self):
        return 0.1 # for exploratory moves

    
    def train(self, num_episodes=10):
        for e in range(num_episodes):
            E = self._init_tenor()  
            state = self.env.init_state()
            action = self._policy(state)
            action_idx = self.actions.all_actions.index(action)
            while not state.is_win:  # play the game greedily until reaching the end state blindly at the beginning, 
                                     # and slowly getting better at choosing which actions.
                self.returns_count[len(state.rods[0])][len(state.rods[1])][len(state.rods[2])][action_idx] += 1
                next_state, reward = self.env.step(state, action)
                print(reward.value)
                print(next_state.rods)
                next_action = self._policy(next_state) # there is no action anymore if you've won
                print(next_action)
                
                td_delta = self._td_delta(state, action_idx, next_state, next_action, reward)
                E[len(state.rods[0])][len(state.rods[1])][len(state.rods[2])][action_idx] += 1
                self.Q += self._get_alpha()*td_delta*E
                E = self.discount_factor*self._lambda*E
                state = next_state
                action = next_action
                
            if e % 10 == 0:
                print("\rEpisode {}/{}.".format(e, num_episodes), end="")


        return self.Q

    def _policy(self, state):
        """
        Returns:
        tuple(a, b)
        """
        if state.is_win: return None
        if random.random() < self._get_epsilon():
            return self._get_random_action()
        else:
            epsilon = self._get_epsilon()
            state_actions = np.ones(self.actions.num_actions, dtype=float) * epsilon/self.actions.num_actions
            greedy_action = np.argmax(self.Q[len(state.rods[0])][len(state.rods[1])][len(state.rods[2])])
            state_actions[greedy_action] += (1.0 - epsilon)
            action = np.random.choice(np.arange(len(state_actions)), p=state_actions)
            return self.actions.all_actions[action]

    def _get_random_action(self):
        """
        https://stackoverflow.com/questions/66727094/how-to-randomly-choose-a-tuple-from-a-list-of-tuples
        Returns:
        Tuple(a,b)
        """
        indeces = [i for i in range(self.actions.num_actions)]
        rand_action =  self.actions.all_actions[np.random.choice(indeces)]
        return rand_action
    
    
    def _td_delta(self, s, a, next_s, next_a, r):
        if next_s.is_win:
            return r.value - self.Q[len(s.rods[0])][len(s.rods[1])][len(s.rods[2])][a]
        else:
            next_action_idx = self.actions.all_actions.index(next_a)
            return (r.value + self.discount_factor*self.Q[len(next_s.rods[0])][len(next_s.rods[1])][len(next_s.rods[2])][next_action_idx]) - self.Q[len(s.rods[0])][len(s.rods[1])][len(s.rods[2])][a]
TowerOfHanoiEnv = Environment(3)  # when using >=4 disks there may be some errors such as the model being
                                  # stuck in a loop. There seems to be a bug. 
LegalActions = Action()
myRLAgent = RLAgent(TowerOfHanoiEnv, LegalActions)
#Q = myRLAgent.train(10)
#print(Q)  # prints the state-value matrix

### DEEP LEARNING FOR SIMPLE CAT IMAGE RECOGNITION. TEMPLATE CODE FOUND AT DEEP LEARNING COURSERA SPECIALIZATION
### PLEASE DON'T LOOK AT THE CODE IF YOU HAVEN'T COMPLETED THE DEEP LEARNING SPECIALIZATION!!!!! AHHHHHHHHH

import numpy as np
import matplotlib.pyplot as plt
import h5py


def load_data():
    train_dataset = h5py.File('train_catvnoncat.h5', "r")
    train_set_x_orig = np.array(train_dataset["train_set_x"][:]) # your train set features
    train_set_y_orig = np.array(train_dataset["train_set_y"][:]) # your train set labels

    test_dataset = h5py.File('test_catvnoncat.h5', "r")
    test_set_x_orig = np.array(test_dataset["test_set_x"][:]) # your test set features
    test_set_y_orig = np.array(test_dataset["test_set_y"][:]) # your test set labels

    classes = np.array(test_dataset["list_classes"][:]) # the list of classes
    
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))
    
    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes
def sigmoid(Z):
    """
    Implements the sigmoid activation in numpy
    
    Arguments:
    Z -- numpy array of any shape
    
    Returns:
    A -- output of sigmoid(z), same shape as Z
    cache -- returns Z as well, useful during backpropagation
    """
    
    A = 1/(1+np.exp(-Z))
    cache = Z
    
    return A, cache

def relu(Z):
    """
    Implement the RELU function.

    Arguments:
    Z -- Output of the linear layer, of any shape

    Returns:
    A -- Post-activation parameter, of the same shape as Z
    cache -- a python dictionary containing "A" ; stored for computing the backward pass efficiently
    """
    
    A = np.maximum(0,Z)
    
    assert(A.shape == Z.shape)
    
    cache = Z 
    return A, cache


def relu_backward(dA, cache):
    """
    Implement the backward propagation for a single RELU unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- 'Z' where we store for computing backward propagation efficiently

    Returns:
    dZ -- Gradient of the cost with respect to Z
    """
    
    Z = cache
    dZ = np.array(dA, copy=True) # just converting dz to a correct object.
    
    # When z <= 0, you should set dz to 0 as well. 
    dZ[Z <= 0] = 0
    
    assert (dZ.shape == Z.shape)
    
    return dZ

def sigmoid_backward(dA, cache):
    """
    Implement the backward propagation for a single SIGMOID unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- 'Z' where we store for computing backward propagation efficiently

    Returns:
    dZ -- Gradient of the cost with respect to Z
    """
    
    Z = cache
    
    s = 1/(1+np.exp(-Z))
    dZ = dA * s * (1-s)
    
    assert (dZ.shape == Z.shape)
    
    return dZ

def initialize_parameters_deep(layer_dims):
    """
    Arguments:
    layer_dims -- python array (list) containing the dimensions of each layer in our network
    
    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                    bl -- bias vector of shape (layer_dims[l], 1)
    """
    
    np.random.seed(1)
    parameters = {}
    L = len(layer_dims)            # number of layers in the network

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) / np.sqrt(layer_dims[l-1]) #*0.01
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
        
        assert(parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l-1]))
        assert(parameters['b' + str(l)].shape == (layer_dims[l], 1))

        
    return parameters

def linear_forward(A, W, b):
    """
    Implement the linear part of a layer's forward propagation.

    Arguments:
    A -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)

    Returns:
    Z -- the input of the activation function, also called pre-activation parameter 
    cache -- a python dictionary containing "A", "W" and "b" ; stored for computing the backward pass efficiently
    """
    
    Z = W.dot(A) + b
    
    assert(Z.shape == (W.shape[0], A.shape[1]))
    cache = (A, W, b)
    
    return Z, cache

def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    A -- the output of the activation function, also called the post-activation value 
    cache -- a python dictionary containing "linear_cache" and "activation_cache";
             stored for computing the backward pass efficiently
    """
    
    if activation == "sigmoid":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = sigmoid(Z)
    
    elif activation == "relu":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)
    
    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)

    return A, cache

def L_model_forward(X, parameters):
    """
    Implement forward propagation for the [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID computation
    
    Arguments:
    X -- data, numpy array of shape (input size, number of examples)
    parameters -- output of initialize_parameters_deep()
    
    Returns:
    AL -- last post-activation value
    caches -- list of caches containing:
                every cache of linear_relu_forward() (there are L-1 of them, indexed from 0 to L-2)
                the cache of linear_sigmoid_forward() (there is one, indexed L-1)
    """

    caches = []
    A = X
    L = len(parameters) // 2                  # number of layers in the neural network
    
    # Implement [LINEAR -> RELU]*(L-1). Add "cache" to the "caches" list.
    for l in range(1, L):
        A_prev = A 
        A, cache = linear_activation_forward(A_prev, parameters['W' + str(l)], parameters['b' + str(l)], activation = "relu")
        caches.append(cache)
    
    # Implement LINEAR -> SIGMOID. Add "cache" to the "caches" list.
    AL, cache = linear_activation_forward(A, parameters['W' + str(L)], parameters['b' + str(L)], activation = "sigmoid")
    caches.append(cache)
    
    assert(AL.shape == (1,X.shape[1]))
            
    return AL, caches

def compute_cost(AL, Y):
    """
    Implement the cost function defined by equation (7).

    Arguments:
    AL -- probability vector corresponding to your label predictions, shape (1, number of examples)
    Y -- true "label" vector (for example: containing 0 if non-cat, 1 if cat), shape (1, number of examples)

    Returns:
    cost -- cross-entropy cost
    """
    
    m = Y.shape[1]

    # Compute loss from aL and y.
    cost = (1./m) * (-np.dot(Y,np.log(AL).T) - np.dot(1-Y, np.log(1-AL).T))
    
    cost = np.squeeze(cost)      # To make sure your cost's shape is what we expect (e.g. this turns [[17]] into 17).
    assert(cost.shape == ())
    
    return cost

def linear_backward(dZ, cache):
    """
    Implement the linear portion of backward propagation for a single layer (layer l)

    Arguments:
    dZ -- Gradient of the cost with respect to the linear output (of current layer l)
    cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    A_prev, W, b = cache
    m = A_prev.shape[1]

    dW = 1./m * np.dot(dZ,A_prev.T)
    db = 1./m * np.sum(dZ, axis = 1, keepdims = True)
    dA_prev = np.dot(W.T,dZ)
    
    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    
    return dA_prev, dW, db

def linear_activation_backward(dA, cache, activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer.
    
    Arguments:
    dA -- post-activation gradient for current layer l 
    cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"
    
    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    linear_cache, activation_cache = cache
    
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    
    return dA_prev, dW, db

def L_model_backward(AL, Y, caches):
    """
    Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group
    
    Arguments:
    AL -- probability vector, output of the forward propagation (L_model_forward())
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat)
    caches -- list of caches containing:
                every cache of linear_activation_forward() with "relu" (there are (L-1) or them, indexes from 0 to L-2)
                the cache of linear_activation_forward() with "sigmoid" (there is one, index L-1)
    
    Returns:
    grads -- A dictionary with the gradients
             grads["dA" + str(l)] = ... 
             grads["dW" + str(l)] = ...
             grads["db" + str(l)] = ... 
    """
    grads = {}
    L = len(caches) # the number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape) # after this line, Y is the same shape as AL
    
    # Initializing the backpropagation
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    
    # Lth layer (SIGMOID -> LINEAR) gradients. Inputs: "AL, Y, caches". Outputs: "grads["dAL"], grads["dWL"], grads["dbL"]
    current_cache = caches[L-1]
    grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation = "sigmoid")
    
    for l in reversed(range(L-1)):
        # lth layer: (RELU -> LINEAR) gradients.
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l + 2)], current_cache, activation = "relu")
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads

def update_parameters(parameters, grads, learning_rate):
    """
    Update parameters using gradient descent
    
    Arguments:
    parameters -- python dictionary containing your parameters 
    grads -- python dictionary containing your gradients, output of L_model_backward
    
    Returns:
    parameters -- python dictionary containing your updated parameters 
                  parameters["W" + str(l)] = ... 
                  parameters["b" + str(l)] = ...
    """
    
    L = len(parameters) // 2 # number of layers in the neural network

    # Update rule for each parameter. Use a for loop.
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]
        
    return parameters

def predict(X, y, parameters):
    """
    This function is used to predict the results of a  L-layer neural network.
    
    Arguments:
    X -- data set of examples you would like to label
    parameters -- parameters of the trained model
    
    Returns:
    p -- predictions for the given dataset X
    """
    
    m = X.shape[1]
    n = len(parameters) // 2 # number of layers in the neural network
    p = np.zeros((1,m))
    
    # Forward propagation
    probas, caches = L_model_forward(X, parameters)

    
    # convert probas to 0/1 predictions
    for i in range(0, probas.shape[1]):
        if probas[0,i] > 0.5:
            p[0,i] = 1
        else:
            p[0,i] = 0
    
    #print results
    #print ("predictions: " + str(p))
    #print ("true labels: " + str(y))
    print("Accuracy: "  + str(np.sum((p == y)/m)))
        
    return p

def print_mislabeled_images(classes, X, y, p):
    """
    Plots images where predictions and truth were different.
    X -- dataset
    y -- true labels
    p -- predictions
    """
    a = p + y
    mislabeled_indices = np.asarray(np.where(a == 1))
    plt.rcParams['figure.figsize'] = (40.0, 40.0) # set default size of plots
    num_images = len(mislabeled_indices[0])
    for i in range(num_images):
        index = mislabeled_indices[1][i]
        
        plt.subplot(2, num_images, i + 1)
        plt.imshow(X[:,index].reshape(64,64,3), interpolation='nearest')
        plt.axis('off')
        plt.title("Prediction: " + classes[int(p[0,index])].decode("utf-8") + " \n Class: " + classes[y[0,index]].decode("utf-8"))

def L_layer_model(X, Y, layers_dims, learning_rate = 0.009, num_iterations = 3000, print_cost=False):
    """
    Implements a L-layer neural network: [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID.

    Arguments:
    X -- data, numpy array of shape (number of examples, num_px * num_px * 3)
    Y -- true "label" vector (containing 0 if cat, 1 if non-cat), of shape (1, number of examples)
    layers_dims -- list containing the input size and each layer size, of length (number of layers + 1).
    learning_rate -- learning rate of the gradient descent update rule
    num_iterations -- number of iterations of the optimization loop
    print_cost -- if True, it prints the cost every 100 steps

    Returns:
    parameters -- parameters learnt by the model. They can then be used to predict.
    """

    np.random.seed(1)
    costs = []                         # keep track of cost
    
    parameters = initialize_parameters_deep(layers_dims)
    
    for i in range (num_iterations):
        
        AL, caches = L_model_forward(X, parameters)
        
        cost = compute_cost (AL, Y)
        
        grads = L_model_backward(AL, Y, caches)
        
        parameters = update_parameters(parameters, grads, learning_rate)
        
        
    # Print the cost every 100 training example
        if print_cost and i % 100 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
        if print_cost and i % 100 == 0:
            costs.append(cost)

    # plot the cost
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per tens)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters
layers_dims = [12288, 20, 7, 5, 1] 
#parameters = L_layer_model(train_x_orig_lr.T, train_y, layers_dims, num_iterations = 2500, print_cost = True)

AI-based solution (also called narrow AI)	Heuristics-based solution (also referred as analytical)
Usually probabilistic in nature	Usually deterministic in nature
Usually poor explainability or interpretability	Usually good explainability and interpretability
Examples involve deep neural networks, reinforcement learning	Examples include linear regression, tree-search algorithms

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	class
count	297.000000	297.000000	297.000000	297.000000	297.000000	297.000000	297.000000	297.000000	297.000000	297.000000	297.000000	297.000000
mean	54.542088	0.676768	3.158249	131.693603	247.350168	0.144781	0.996633	149.599327	0.326599	1.055556	1.602694	0.946128
std	9.049736	0.468500	0.964859	17.762806	51.997583	0.352474	0.994914	22.941562	0.469761	1.166123	0.618187	1.234551
min	29.000000	0.000000	1.000000	94.000000	126.000000	0.000000	0.000000	71.000000	0.000000	0.000000	1.000000	0.000000
25%	48.000000	0.000000	3.000000	120.000000	211.000000	0.000000	0.000000	133.000000	0.000000	0.000000	1.000000	0.000000
50%	56.000000	1.000000	3.000000	130.000000	243.000000	0.000000	1.000000	153.000000	0.000000	0.800000	2.000000	0.000000
75%	61.000000	1.000000	4.000000	140.000000	276.000000	0.000000	2.000000	166.000000	1.000000	1.600000	2.000000	2.000000
max	77.000000	1.000000	4.000000	200.000000	564.000000	1.000000	2.000000	202.000000	1.000000	6.200000	3.000000	4.000000

Dep. Variable:	y	No. Observations:	207
Model:	MNLogit	Df Residuals:	155
Method:	MLE	Df Model:	48
Date:	Sat, 02 Apr 2022	Pseudo R-squ.:	0.3763
Time:	16:07:52	Log-Likelihood:	-164.80
converged:	True	LL-Null:	-264.22
Covariance Type:	nonrobust	LLR p-value:	2.912e-20

y=1	coef	std err	z	P>\|z\|	[0.025	0.975]
x1	-0.0446	0.027	-1.648	0.099	-0.098	0.008
x2	0.9794	0.603	1.625	0.104	-0.202	2.161
x3	0.2384	0.245	0.972	0.331	-0.242	0.719
x4	0.0274	0.013	2.131	0.033	0.002	0.053
x5	0.0029	0.005	0.592	0.554	-0.007	0.013
x6	-1.1738	0.758	-1.549	0.121	-2.659	0.311
x7	0.1337	0.241	0.555	0.579	-0.338	0.606
x8	-0.0440	0.011	-3.945	0.000	-0.066	-0.022
x9	1.1704	0.539	2.173	0.030	0.115	2.226
x10	0.0633	0.287	0.220	0.825	-0.499	0.626
x11	0.2703	0.426	0.635	0.525	-0.564	1.104
x12	0.7982	0.335	2.382	0.017	0.141	1.455
x13	0.2476	0.132	1.875	0.061	-0.011	0.506
y=2	coef	std err	z	P>\|z\|	[0.025	0.975]
x1	-0.0234	0.036	-0.657	0.511	-0.093	0.046
x2	0.6922	0.809	0.856	0.392	-0.894	2.278
x3	0.5337	0.393	1.359	0.174	-0.236	1.303
x4	0.0216	0.017	1.248	0.212	-0.012	0.056
x5	0.0016	0.006	0.260	0.795	-0.011	0.014
x6	-0.1144	0.844	-0.136	0.892	-1.768	1.539
x7	-0.2439	0.329	-0.741	0.459	-0.889	0.402
x8	-0.0626	0.014	-4.469	0.000	-0.090	-0.035
x9	1.2058	0.710	1.698	0.090	-0.186	2.598
x10	0.8793	0.347	2.536	0.011	0.200	1.559
x11	-0.2558	0.615	-0.416	0.677	-1.460	0.949
x12	1.0306	0.394	2.617	0.009	0.259	1.803
x13	0.4167	0.187	2.231	0.026	0.051	0.783
y=3	coef	std err	z	P>\|z\|	[0.025	0.975]
x1	-0.0809	0.040	-2.040	0.041	-0.159	-0.003
x2	0.0744	0.866	0.086	0.932	-1.623	1.771
x3	0.5498	0.435	1.263	0.207	-0.304	1.403
x4	0.0349	0.019	1.852	0.064	-0.002	0.072
x5	-0.0026	0.007	-0.366	0.715	-0.016	0.011
x6	0.3754	0.823	0.456	0.648	-1.237	1.988
x7	0.6569	0.351	1.873	0.061	-0.030	1.344
x8	-0.0625	0.015	-4.042	0.000	-0.093	-0.032
x9	1.1190	0.775	1.443	0.149	-0.401	2.639
x10	0.4566	0.352	1.296	0.195	-0.234	1.147
x11	0.1707	0.649	0.263	0.792	-1.101	1.442
x12	1.6337	0.402	4.067	0.000	0.846	2.421
x13	0.6072	0.213	2.853	0.004	0.190	1.024
y=4	coef	std err	z	P>\|z\|	[0.025	0.975]
x1	-0.1309	0.072	-1.815	0.069	-0.272	0.010
x2	0.2766	1.374	0.201	0.840	-2.417	2.970
x3	0.4453	0.832	0.535	0.593	-1.186	2.077
x4	0.0534	0.038	1.389	0.165	-0.022	0.129
x5	-0.0069	0.011	-0.608	0.543	-0.029	0.015
x6	-2.6535	2.237	-1.186	0.236	-7.038	1.731
x7	1.1721	0.691	1.697	0.090	-0.182	2.526
x8	-0.1056	0.028	-3.714	0.000	-0.161	-0.050
x9	-1.2295	1.285	-0.957	0.339	-3.748	1.289
x10	0.5350	0.557	0.960	0.337	-0.557	1.627
x11	1.7512	1.565	1.119	0.263	-1.316	4.819
x12	1.9947	0.617	3.235	0.001	0.786	3.203
x13	1.1011	0.444	2.482	0.013	0.231	1.971

When to use Artificial Intelligence to solve a problem?¶

An AI solution is preferred to problems which are hard to be defined mathematically¶

An AI solution is preferred when the variables involved in the problem (input features predict an output) don't follow a monotonic relationship and/or can't be linearised¶

Dataset I of cat images: convolutional deep learning vs. logistic regression¶

Dataset II: Coronary artery disease features¶

A symbiosis rather than zero-sum choice¶

Closing remarks: NP, AGI and quantum computing¶

Appendix Section¶

	age	sex	cp	trestbps	chol	fbs	restecg	thalach	exang	oldpeak	slope	ca	thal	class
0	63.0	1.0	1.0	145.0	233.0	1.0	2.0	150.0	0.0	2.3	3.0	0.0	6.0	0
1	67.0	1.0	4.0	160.0	286.0	0.0	2.0	108.0	1.0	1.5	2.0	3.0	3.0	2
2	67.0	1.0	4.0	120.0	229.0	0.0	2.0	129.0	1.0	2.6	2.0	2.0	7.0	1
3	37.0	1.0	3.0	130.0	250.0	0.0	0.0	187.0	0.0	3.5	3.0	0.0	3.0	0
4	41.0	0.0	2.0	130.0	204.0	0.0	2.0	172.0	0.0	1.4	1.0	0.0	3.0	0