diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 00000000..8f5ca8cf Binary files /dev/null and b/.DS_Store differ diff --git a/Eligibility Trace agents/.DS_Store b/Eligibility Trace agents/.DS_Store new file mode 100644 index 00000000..e63abd15 Binary files /dev/null and b/Eligibility Trace agents/.DS_Store differ diff --git a/Eligibility Trace agents/Actor-Critic/.DS_Store b/Eligibility Trace agents/Actor-Critic/.DS_Store new file mode 100644 index 00000000..8efb5831 Binary files /dev/null and b/Eligibility Trace agents/Actor-Critic/.DS_Store differ diff --git a/Eligibility Trace agents/Actor-Critic/agent_actor_critic.py b/Eligibility Trace agents/Actor-Critic/agent_actor_critic.py new file mode 100644 index 00000000..d9d9def4 --- /dev/null +++ b/Eligibility Trace agents/Actor-Critic/agent_actor_critic.py @@ -0,0 +1,271 @@ +import math +import random +from typing import NamedTuple, Optional, Tuple +import numpy as np +from numpy import ndarray +import logging + +from cyberbattle._env import cyberbattle_env +from agent_wrapper import EnvironmentBounds + +import agent_wrapper as w +from learner import Learner + +import torch + +class StateActionModel: + """How the state is modelled in the enviroment""" + + def __init__(self, ep: EnvironmentBounds): + self.ep = ep + + self.global_features = w.ConcatFeatures(ep, [ + w.Feature_discovered_not_owned_nodes_sliding(ep), + w.Feature_discovered_credential_count(ep) + ]) + + self.source_node_features = w.ConcatFeatures(ep, [ + w.Feature_active_node_properties(ep), + w.Feature_success_actions_at_node(ep) + ]) + + self.target_node_features = w.ConcatFeatures(ep, [ + w.Feature_active_node_id(ep) + ]) + + self.state_space = w.ConcatFeatures(ep, self.global_features.feature_selection + + self.source_node_features.feature_selection + + self.target_node_features.feature_selection) + + self.action_space = w.AbstractAction(ep) + + def valid_actions(self, wrapped_env: w.AgentWrapper, observation): + """returns a list of valid actions and the nodes they can be carried out from""" + + nodes_and_actions = [] + discovered_nodes = np.union1d(w.owned_nodes(observation), w.discovered_nodes_notowned(observation)) + + for from_node in w.owned_nodes(observation): + for local_action in range(self.action_space.n_local_actions): + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, local_action, None) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, local_action, -1)) + + for remote_action in range(self.action_space.n_local_actions, self.action_space.n_local_actions + self.action_space.n_remote_actions): + for target_node in discovered_nodes: + if target_node != from_node: + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, remote_action, target_node) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, remote_action, target_node)) + + for connect_action in range(self.action_space.n_local_actions + self.action_space.n_remote_actions, self.action_space.n_actions): + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, connect_action, None) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, connect_action, -1)) + + return nodes_and_actions + +class Memory: + """The memory structure that stores the critic value function and the 
actors state action policy""" + + def __init__(self, ep:EnvironmentBounds, hash_size): + self.hash_size = hash_size + + self.actor = torch.zeros([2, hash_size], dtype=torch.float64) + + self.critic = torch.zeros([2, hash_size], dtype=torch.float64) + + def state_action_index(self, state_space, abstract_action): + """Turns a state action pair into an index for the actor tensor""" + feature_vector = np.append(state_space, abstract_action) + hash_number = abs(hash(str(feature_vector))) + return hash_number % self.hash_size + + def state_index(self, state_space): + """Turns the state into an index for the critic tensor""" + hash_number = abs(hash(str(state_space))) + return hash_number % self.hash_size + + +class ChosenActionMetadata(NamedTuple): + """Metadata attached to every gym action""" + + abstract_action: np.int32 + actor_node: int + actor_features: ndarray + actor_state: ndarray + + def __repr__(self) -> str: + return f"[abstract_action={self.abstract_action}, actor={self.actor_node}, state={self.actor_state}]" + +class ActorCriticPolicy(Learner): + + def __init__(self, + ep: EnvironmentBounds, + gamma: float, + λ: float, + learning_rate: float, + hash_size: int + ): + + self.n_local_actions = ep.local_attacks_count + self.n_remote_actions = ep.remote_attacks_count + self.model = StateActionModel(ep) + self.gamma = gamma + self.λ = λ + self.learning_rate = learning_rate + self.hash_size = hash_size + + self.memory = Memory(ep, hash_size=hash_size) + + def parameters_as_string(self): + return f'γ={self.gamma}, lr={self.learning_rate}, λ={self.λ},\n' \ + f'hash_size={self.hash_size}' + + def all_parameters_as_string(self) -> str: + model = self.model + return f'{self.parameters_as_string()}\n' \ + f'dimension={model.state_space.flat_size()}x{model.action_space.flat_size()}, ' \ + f'Q={[f.name() for f in model.state_space.feature_selection]} ' \ + f"-> 'abstract_action'" + + def get_actor_state_vector(self, global_state: ndarray, actor_features: ndarray, target_features: Optional[ndarray]) -> ndarray: + """Turns seperate state features into one vector""" + if target_features is None: + return np.concatenate((np.array(global_state, dtype=np.float32), + np.array(actor_features, dtype=np.float32))) + else: + return np.concatenate((np.array(global_state, dtype=np.float32), + np.array(actor_features, dtype=np.float32), + np.array(target_features, dtype=np.float32))) + + def update_memory(self, + reward: float, + actor_state: ndarray, + abstract_action: int, + next_actor_state: Optional[ndarray]): + """The actor's and critic's memories are updated with reward from the action just used""" + + #The temporal difference error, δ, is calculated then used to update the actor and critic + current_state_index = self.memory.state_index(actor_state) + if next_actor_state is None: + δ = reward - self.memory.critic[0][current_state_index].item() + else: + next_state_index = self.memory.state_index(next_actor_state) + δ = reward + (self.gamma * self.memory.critic[0][next_state_index].item()) - self.memory.critic[0][current_state_index].item() + + #Update the Actor + current_state_action_index = self.memory.state_action_index(actor_state, abstract_action) + + self.memory.actor[1][current_state_action_index] += 1 + + self.memory.actor[0][current_state_action_index] += self.learning_rate * δ * self.memory.actor[1][current_state_action_index].item() + self.memory.actor[0][current_state_action_index] = round(self.memory.actor[0][current_state_action_index].item(), 5) + 
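# clamp the updated actor preference into [0, 100]; assumed here to be a numerical guard so exp() in the Gibbs softmax below cannot overflow
+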
self.memory.actor[0][current_state_action_index] = max(0, self.memory.actor[0][current_state_action_index].item()) + self.memory.actor[0][current_state_action_index] = min(100, self.memory.actor[0][current_state_action_index].item()) + + non_zero_indicies = torch.argwhere(self.memory.actor[1]).numpy() + for i in non_zero_indicies: + self.memory.actor[1][i] = self.memory.actor[1][i].item() * self.gamma * self.λ + + #Update the Critic + self.memory.critic[1][current_state_index] += 1 + + non_zero_indicies_v = torch.argwhere(self.memory.critic[0]).numpy() + non_zero_indicies_e = torch.argwhere(self.memory.critic[1]).numpy() + non_zero_indicies = np.union1d(non_zero_indicies_v, non_zero_indicies_e) + + for i in non_zero_indicies: + + self.memory.critic[0][i] = self.memory.critic[0][i].item() + (self.learning_rate * δ * self.memory.critic[1][i].item()) + self.memory.critic[1][i] = self.memory.critic[1][i].item() * self.gamma * self.λ + self.memory.critic[0][i] = max(0, self.memory.critic[0][i].item()) + + def on_step(self, wrapped_env: w.AgentWrapper, reward: float, done: bool, action_metadata): + + if done: + self.update_memory(reward, + actor_state=action_metadata.actor_state, + abstract_action=action_metadata.abstract_action, + next_actor_state=None + ) + else: + self.update_memory(reward, + actor_state=action_metadata.actor_state, + abstract_action=action_metadata.abstract_action, + next_actor_state=wrapped_env.state + ) + + + def new_episode(self): + torch.mul(self.memory.actor[1], 0) + torch.mul(self.memory.critic[1], 0) + + def end_of_episode(self, i_episode, t): + return None + + def end_of_iteration(self, t, done): + return None + + def metadata_from_gymaction(self, wrapped_env: w.AgentWrapper, gym_action): + """Takes in a gym action and returns it's metadata""" + current_global_state = self.model.global_features.get(wrapped_env.state, node=None) + actor_node = cyberbattle_env.sourcenode_of_action(gym_action) + actor_features = self.model.source_node_features.get(wrapped_env.state, actor_node) + abstract_action = self.model.action_space.abstract_from_gymaction(gym_action) + + if 'remote_vulnerability' in gym_action: + target_node = self.model.target_node_features.get(wrapped_env.state, gym_action['remote_vulnerability'][1]) + else: + target_node = None + + return ChosenActionMetadata( + abstract_action=abstract_action, + actor_node=actor_node, + actor_features=actor_features, + actor_state=self.get_actor_state_vector(current_global_state, actor_features, target_node)) + + def get_action(self, wrapped_env: w.AgentWrapper, observation, exploit) -> Tuple[str, Optional[cyberbattle_env.Action], object, float]: + """Uses Gibbs Softmax distribution to select the next action to be used""" + current_global_state = self.model.global_features.get(wrapped_env.state, node=None) + valid_nodes_and_actions = self.model.valid_actions(wrapped_env, observation) + + #The p_values are the estimated returns from the actor function of taking the action in the current state + p_values = [] + for item in valid_nodes_and_actions: + source_node_features = self.model.source_node_features.get(wrapped_env.state, item[0]) + + if item[1] < self.n_local_actions or item[1] - self.n_local_actions > self.n_remote_actions: + actor_state_vector = self.get_actor_state_vector(current_global_state, source_node_features, None) + else: + target_node_features = self.model.target_node_features.get(wrapped_env.state, item[2]) + actor_state_vector = self.get_actor_state_vector(current_global_state, source_node_features, 
target_node_features) + + action_state_index = self.memory.state_action_index(actor_state_vector, item[1]) + + p_values.append(self.memory.actor[0][action_state_index].item()) + + if exploit: + indicies_of_chosen_actions = [i for i, x in enumerate(p_values) if x == max(p_values)] + chosen_action_index = random.choice(indicies_of_chosen_actions) + chosen_action = valid_nodes_and_actions[chosen_action_index] + + else: + softmax_denominator = 0 + for p_value in p_values: + softmax_denominator += math.exp(p_value) + + probabilities = [] + for p_value in p_values: + probabilities.append(math.exp(p_value) / softmax_denominator) + + chosen_action = random.choices(valid_nodes_and_actions, weights=probabilities, k=1)[0] + + if chosen_action[1] < self.n_local_actions or chosen_action[1] - self.n_local_actions > self.n_remote_actions: + gym_action = self.model.action_space.abstract_to_gymaction(chosen_action[0], observation, chosen_action[1], None) + else: + gym_action = self.model.action_space.abstract_to_gymaction(chosen_action[0], observation, chosen_action[1], chosen_action[2]) + + metadata = self.metadata_from_gymaction(wrapped_env, gym_action) + + return gym_action, metadata \ No newline at end of file diff --git a/Eligibility Trace agents/Actor-Critic/agent_wrapper.py b/Eligibility Trace agents/Actor-Critic/agent_wrapper.py new file mode 100644 index 00000000..985aa1e9 --- /dev/null +++ b/Eligibility Trace agents/Actor-Critic/agent_wrapper.py @@ -0,0 +1,342 @@ +from cyberbattle._env.cyberbattle_env import EnvironmentBounds +from typing import Optional, List +import enum +import numpy as np +from gym import spaces, Wrapper +from numpy import ndarray +import cyberbattle._env.cyberbattle_env as cyberbattle_env +import logging + + +class StateAugmentation: + """Default agent state augmentation, consisting of the gym environment + observation itself and nothing more.""" + + def __init__(self, observation: cyberbattle_env.Observation): + self.observation = observation + + def on_step(self, action: cyberbattle_env.Action, reward: float, done: bool, observation: cyberbattle_env.Observation): + self.observation = observation + + def on_reset(self, observation: cyberbattle_env.Observation): + self.observation = observation + + +class Feature(spaces.MultiDiscrete): + """ + Feature consisting of multiple discrete dimensions. + Parameters: + nvec: is a vector defining the number of possible values + for each discrete space. 
+ """ + + def __init__(self, env_properties: EnvironmentBounds, nvec): + self.env_properties = env_properties + super().__init__(nvec) + + def flat_size(self): + return np.prod(self.nvec) + + def name(self): + """Return the name of the feature""" + p = len(type(Feature(self.env_properties, [])).__name__) + 1 + return type(self).__name__[p:] + + def get(self, a: StateAugmentation, node: Optional[int]) -> np.ndarray: + """Compute the current value of a feature value at + the current observation and specific node""" + raise NotImplementedError + + def pretty_print(self, v): + return v + +class Feature_active_node_properties(Feature): + """Bitmask of all properties set for the active node""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [2] * p.property_count) + + def get(self, a: StateAugmentation, node) -> ndarray: + assert node is not None, 'feature only valid in the context of a node' + + node_prop = a.observation['discovered_nodes_properties'] + + # list of all properties set/unset on the node + # Remap to get rid of unknown value 0: 1 -> 1, and -1 -> 0 (and 0-> 0) + assert node < len(node_prop), f'invalid node index {node} (not discovered yet)' + remapped = np.array((1 + node_prop[node]) / 2, dtype=int) + return remapped + + +class Feature_active_node_id(Feature): + """Return the node id itself""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count] * 1) + + def get(self, a: StateAugmentation, node) -> ndarray: + return np.array([node], dtype=int) + + +class Feature_discovered_credential_count(Feature): + """number of credentials discovered so far""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_total_credentials + 1]) + + def get(self, a: StateAugmentation, node): + return [len(a.observation['credential_cache_matrix'])] + + +class Feature_discovered_not_owned_nodes_sliding(Feature): + """array of which of discovered nodes not owned by name""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count + 1]) + + def get(self, a: StateAugmentation, node): + discovered = a.observation['discovered_nodes'] + levels = a.observation['nodes_privilegelevel'] + owned_nodes_indices = np.where(levels > 0)[0] + owned = [] + for i in owned_nodes_indices: + owned.append(discovered[i]) + discovered_not_owned = [] + for node in discovered: + if node not in owned: + discovered_not_owned.append(node) + discovered_not_owned_sliding = np.zeros(self.env_properties.maximum_node_count, np.int32) + for node_id in discovered_not_owned: + if node_id == 'client': + discovered_not_owned_sliding[0] = 1 + elif node_id == 'Website': + discovered_not_owned_sliding[1] = 1 + elif node_id == 'Website.Directory': + discovered_not_owned_sliding[2] = 1 + elif node_id == 'Website[user=monitor]': + discovered_not_owned_sliding[3] = 1 + elif node_id == 'GitHubProject': + discovered_not_owned_sliding[4] = 1 + elif node_id == 'AzureStorage': + discovered_not_owned_sliding[5] = 1 + elif node_id == 'Sharepoint': + discovered_not_owned_sliding[6] = 1 + elif node_id == 'AzureResourceManager': + discovered_not_owned_sliding[7] = 1 + elif node_id == 'AzureResourceManager[user-monitor]': + discovered_not_owned_sliding[8] = 1 + elif node_id == 'AzureVM': + discovered_not_owned_sliding[9] = 1 + return discovered_not_owned_sliding + +class Feature_active_node_id(Feature): + """number asigned to each type of node in toy-ctf""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count + 
1]) + + def get(self, a: StateAugmentation, node): + node_id = a.observation['discovered_nodes'][node] + + node_id_array = np.zeros(1, np.int32) + if node_id == 'client': + node_id_array[0] = 0 + elif node_id == 'Website': + node_id_array[0] = 1 + elif node_id == 'Website.Directory': + node_id_array[0] = 2 + elif node_id == 'Website[user=monitor]': + node_id_array[0] = 3 + elif node_id == 'GitHubProject': + node_id_array[0] = 4 + elif node_id == 'AzureStorage': + node_id_array[0] = 5 + elif node_id == 'Sharepoint': + node_id_array[0] = 6 + elif node_id == 'AzureResourceManager': + node_id_array[0] = 7 + elif node_id == 'AzureResourceManager[user-monitor]': + node_id_array[0] = 8 + elif node_id == 'AzureVM': + node_id_array[0] = 9 + else: + node_id_array[0] = 10 + return node_id_array + + +class ConcatFeatures(Feature): + """ Concatenate a list of features into a single feature + Parameters: + feature_selection - a selection of features to combine + """ + + def __init__(self, p: EnvironmentBounds, feature_selection: List[Feature]): + self.feature_selection = feature_selection + self.dim_sizes = np.concatenate([f.nvec for f in feature_selection]) + super().__init__(p, [self.dim_sizes]) + + def pretty_print(self, v): + return v + + def get(self, a: StateAugmentation, node=None) -> np.ndarray: + """Return the feature vector""" + feature_vector = [f.get(a, node) for f in self.feature_selection] + return np.concatenate(feature_vector) + + +def owned_nodes(observation): + """Return the list of owned nodes""" + return np.nonzero(observation['nodes_privilegelevel'])[0] + + +def discovered_nodes_notowned(observation): + """Return the list of discovered nodes that are not owned yet""" + return np.nonzero(observation['nodes_privilegelevel'] == 0)[0] + + +class AbstractAction(Feature): + """An abstraction of the gym state space that reduces + the space dimension for learning use to just + - local_attack(vulnid) (source_node provided) + - remote_attack(vulnid) (source_node provided, target_node forgotten) + - connect(port) (source_node provided, target_node forgotten, credentials infered from cache) + """ + + def __init__(self, p: EnvironmentBounds): + self.n_local_actions = p.local_attacks_count + self.n_remote_actions = p.remote_attacks_count + self.n_connect_actions = p.port_count + self.n_actions = self.n_local_actions + self.n_remote_actions + self.n_connect_actions + super().__init__(p, [self.n_actions]) + + def abstract_to_gymaction(self, source_node, observation, abstract_action, target_node): + """Takes a statring node and an abstract action number and returns a gym action""" + + if abstract_action < self.n_local_actions: + vuln = abstract_action + return {'local_vulnerability': np.array([source_node, vuln])} + + node_prop = observation['discovered_nodes_properties'] + abstract_action -= self.n_local_actions + if abstract_action < self.n_remote_actions: + vuln = abstract_action + + discovered_nodes_count = len(node_prop) + if discovered_nodes_count <= 1: + return None + + return {'remote_vulnerability': np.array([source_node, target_node, vuln])} + + abstract_action -= self.n_remote_actions + port = np.int32(abstract_action) + + discovered_credentials = np.array(observation['credential_cache_matrix']) + n_discovered_creds = len(discovered_credentials) + if n_discovered_creds <= 0: + return None + + nodes_not_owned = discovered_nodes_notowned(observation) + match_port = discovered_credentials[:, 1] == port + match_port_indicies = np.where(match_port)[0] + + credential_indices_choices = [c for c 
in match_port_indicies + if discovered_credentials[c, 0] in nodes_not_owned] + + if credential_indices_choices: + logging.debug('found matching cred in the credential cache') + else: + logging.debug('no cred matching requested port, trying instead creds used to access other ports') + credential_indices_choices = [i for (i, n) in enumerate(discovered_credentials[:, 0]) + if n in nodes_not_owned] + + if credential_indices_choices: + logging.debug('found cred in the credential cache without matching port name') + else: + logging.debug('no cred to use from the credential cache') + return None + + cred = np.int32(np.random.choice(credential_indices_choices)) + target = np.int32(discovered_credentials[cred, 0]) + return {'connect': np.array([source_node, target, port, cred], dtype=np.int32)} + + def abstract_from_gymaction(self, gym_action: cyberbattle_env.Action) -> np.int32: + """Turns a gym action into it's abstract action number""" + if 'local_vulnerability' in gym_action: + return gym_action['local_vulnerability'][1] + elif 'remote_vulnerability' in gym_action: + r = gym_action['remote_vulnerability'] + return self.n_local_actions + r[2] + + assert 'connect' in gym_action + c = gym_action['connect'] + + a = self.n_local_actions + self.n_remote_actions + c[2] + assert a < self.n_actions + return np.int32(a) + + +class ActionTrackingStateAugmentation(StateAugmentation): + """An agent state augmentation consisting of + the environment observation augmented with the following dynamic information: + - success_action_count: count of action taken and succeeded at the current node + - failed_action_count: count of action taken and failed at the current node + """ + + def __init__(self, p: EnvironmentBounds, observation: cyberbattle_env.Observation): + self.aa = AbstractAction(p) + self.success_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.failed_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.env_properties = p + super().__init__(observation) + + def on_step(self, action: cyberbattle_env.Action, reward: float, done: bool, observation: cyberbattle_env.Observation): + node = cyberbattle_env.sourcenode_of_action(action) + abstract_action = self.aa.abstract_from_gymaction(action) + if reward > 0: + self.success_action_count[node, abstract_action] += 1 + else: + self.failed_action_count[node, abstract_action] += 1 + super().on_step(action, reward, done, observation) + + def on_reset(self, observation: cyberbattle_env.Observation): + p = self.env_properties + self.success_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.failed_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + super().on_reset(observation) + + +class Feature_success_actions_at_node(Feature): + """number of time each action succeeded at a given node""" + + max_action_count = 100 + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [self.max_action_count] * AbstractAction(p).n_actions) + + def get(self, a: ActionTrackingStateAugmentation, node: int): + return np.minimum(a.success_action_count[node, :], self.max_action_count - 1) + + +class Verbosity(enum.Enum): + """Verbosity of the learning function""" + Quiet = 0 + Normal = 1 + Verbose = 2 + + +class AgentWrapper(Wrapper): + """Gym wrapper to update the agent state on every step""" + + def __init__(self, env: cyberbattle_env.CyberBattleEnv, state: StateAugmentation): + super().__init__(env) 
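+        # keep a reference to the state augmentation; step() and reset() below pass every new observation back into it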
+ self.state = state + + def step(self, action: cyberbattle_env.Action): + observation, reward, done, info = self.env.step(action) + self.state.on_step(action, reward, done, observation) + return observation, reward, done, info + + def reset(self): + observation = self.env.reset() + self.state.on_reset(observation) + return observation \ No newline at end of file diff --git a/Eligibility Trace agents/Actor-Critic/learner.py b/Eligibility Trace agents/Actor-Critic/learner.py new file mode 100644 index 00000000..60ad359c --- /dev/null +++ b/Eligibility Trace agents/Actor-Critic/learner.py @@ -0,0 +1,232 @@ +from cmath import pi +import math +import sys + +from plotting import PlotTraining, plot_averaged_cummulative_rewards +from agent_wrapper import AgentWrapper, EnvironmentBounds, Verbosity, ActionTrackingStateAugmentation +import logging +import numpy as np +from cyberbattle._env import cyberbattle_env +from typing import Tuple, Optional, TypedDict, List +import progressbar +import abc + +class Agent(abc.ABC): + + @abc.abstractmethod + def get_action(self, wrapped_env: AgentWrapper, observation, exploit) -> Tuple[str, Optional[cyberbattle_env.Action], object, float]: + """Exploit function. + Returns (action_type, gym_action, action_metadata) where + action_metadata is a custom object that gets passed to the on_step callback function""" + raise NotImplementedError + + @abc.abstractmethod + def on_step(self, wrapped_env: AgentWrapper, reward, done, action_metadata,π) -> None: + raise NotImplementedError + + def parameters_as_string(self) -> str: + return '' + + def all_parameters_as_string(self) -> str: + return '' + + def loss_as_string(self) -> str: + return '' + + def stateaction_as_string(self, action_metadata) -> str: + return '' + +Breakdown = TypedDict('Breakdown', { + 'local': int, + 'remote': int, + 'connect': int +}) + +Outcomes = TypedDict('Outcomes', { + 'reward': Breakdown, + 'noreward': Breakdown +}) + +Stats = TypedDict('Stats', { + 'exploit': Outcomes, + 'explore': Outcomes, + 'exploit_deflected_to_explore': int +}) + +TrainedAgent = TypedDict('TrainedAgent', { + 'all_episodes_rewards': List[List[float]], + 'all_episodes_availability': List[List[float]], + 'agent': Agent, + 'trained_on': str, + 'title': str +}) + +def print_stats(stats): + """Print learning statistics""" + print(" Breakdown [Reward/NoReward (Success rate)]") + def ratio(kind: str) -> str: + x, y = stats['reward'][kind], stats['noreward'][kind] + sum = x + y + if sum == 0: + return 'NaN' + else: + return f"{(x / sum):.2f}" + + def print_kind(kind: str): + print( + f" {kind}: {stats['reward'][kind]}/{stats['noreward'][kind]} " + f"({ratio(kind)})") + print_kind('local') + print_kind('remote') + print_kind('connect') + +def gibbs_softmax_search( + cyberbattle_gym_env: cyberbattle_env.CyberBattleEnv, + environment_properties: EnvironmentBounds, + agent: Agent, + title: str, + episode_count: int, + iteration_count: int, + exploit: bool, + render=True, + render_last_episode_rewards_to: Optional[str] = None, + verbosity: Verbosity = Verbosity.Normal, + plot_episodes_length=True +) -> TrainedAgent: + + print(f"###### {title}\n" + f"Learning with: episode_count={episode_count}," + f"iteration_count={iteration_count}," + + f"{agent.parameters_as_string()}") + + all_episodes_rewards = [] + all_episodes_availability = [] + + wrapped_env = AgentWrapper(cyberbattle_gym_env, + ActionTrackingStateAugmentation(environment_properties, cyberbattle_gym_env.reset())) + steps_done = 0 + plot_title = f"{title} 
(epochs={episode_count}" \ + + agent.parameters_as_string() + plottraining = PlotTraining(title=plot_title, render_each_episode=render) + + render_file_index = 1 + + for i_episode in range(1, episode_count + 1): + + print(f"\n ## Episode: {i_episode}/{episode_count} '{title}' " + f"{agent.parameters_as_string()}") + + observation = wrapped_env.reset() + total_reward = 0.0 + all_rewards = [] + all_availability = [] + agent.new_episode() + + stats = Stats(Outcomes(reward=Breakdown(local=0, remote=0, connect=0), + noreward=Breakdown(local=0, remote=0, connect=0)) + ) + + episode_ended_at = None + sys.stdout.flush() + + bar = progressbar.ProgressBar( + widgets=[ + 'Episode ', + f'{i_episode}', + '|Iteration ', + progressbar.Counter(), + '|', + progressbar.Variable(name='reward', width=6, precision=10), + '|', + progressbar.Variable(name='last_reward_at', width=4), + '|', + progressbar.Timer(), + progressbar.Bar() + ], + redirect_stdout=False) + + for t in bar(range(1, 1 + iteration_count)): + + steps_done += 1 + + gym_action, action_metadata = agent.get_action(wrapped_env, observation, exploit) + + # Take the step + logging.debug(f"gym_action={gym_action}, action_metadata={action_metadata}") + observation, reward, done, info = wrapped_env.step(gym_action) + + outcome = 'reward' if reward > 0 else 'noreward' + if 'local_vulnerability' in gym_action: + stats[outcome]['local'] += 1 + elif 'remote_vulnerability' in gym_action: + stats[outcome]['remote'] += 1 + else: + stats[outcome]['connect'] += 1 + + agent.on_step(wrapped_env, reward, done, action_metadata) + assert np.shape(reward) == () + + all_rewards.append(reward) + all_availability.append(info['network_availability']) + total_reward += reward + bar.update(t, reward=total_reward) + if reward > 0: + bar.update(t, last_reward_at=t) + + if verbosity == Verbosity.Verbose or (verbosity == Verbosity.Normal and reward > 0): + sign = ['-', '+'][reward > 0] + + print(f" {sign} t={t} r={reward} cum_reward:{total_reward} " + f"a={action_metadata}-{gym_action} " + f"creds={len(observation['credential_cache_matrix'])} " + f" {agent.stateaction_as_string(action_metadata)}") + + if i_episode == episode_count \ + and render_last_episode_rewards_to is not None \ + and reward > 0: + fig = cyberbattle_gym_env.render_as_fig() + fig.write_image(f"{render_last_episode_rewards_to}-e{i_episode}-{render_file_index}.png") + render_file_index += 1 + + agent.end_of_iteration(t, done) + + if done: + episode_ended_at = t + bar.finish(dirty=True) + break + + sys.stdout.flush() + + loss_string = agent.loss_as_string() + if loss_string: + loss_string = "loss={loss_string}" + + if episode_ended_at: + print(f" Episode {i_episode} ended at t={episode_ended_at} {loss_string}") + else: + print(f" Episode {i_episode} stopped at t={iteration_count} {loss_string}") + + print_stats(stats) + + all_episodes_rewards.append(all_rewards) + all_episodes_availability.append(all_availability) + + length = episode_ended_at if episode_ended_at else iteration_count + agent.end_of_episode(i_episode=i_episode, t=length) + if plot_episodes_length: + plottraining.episode_done(length) + if render: + wrapped_env.render() + + wrapped_env.close() + print("simulation ended") + if plot_episodes_length: + plottraining.plot_end() + + return TrainedAgent( + all_episodes_rewards=all_episodes_rewards, + all_episodes_availability=all_episodes_availability, + agent=agent, + trained_on=cyberbattle_gym_env.name, + title=plot_title + ) \ No newline at end of file diff --git a/Eligibility Trace 
agents/Actor-Critic/plotting.py b/Eligibility Trace agents/Actor-Critic/plotting.py new file mode 100644 index 00000000..e51b4a55 --- /dev/null +++ b/Eligibility Trace agents/Actor-Critic/plotting.py @@ -0,0 +1,203 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Plotting helpers for agent banchmarking""" + +import matplotlib.pyplot as plt # type:ignore +import numpy as np + + +def new_plot(title): + """Prepare a new plot of cumulative rewards""" + plt.figure(figsize=(10, 8)) + plt.ylabel('cumulative reward', fontsize=20) + plt.xlabel('step', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title(title, fontsize=12) + + +def pad(array, length): + """Pad an array with 0s to make it of desired length""" + padding = np.zeros((length,)) + padding[:len(array)] = array + return padding + + +def plot_episodes_rewards_averaged(results): + """Plot cumulative rewards for a given set of specified episodes""" + max_iteration_count = np.max([len(r) for r in results['all_episodes_rewards']]) + + all_episodes_rewards_padded = [pad(rewards, max_iteration_count) for rewards in results['all_episodes_rewards']] + cumrewards = np.cumsum(all_episodes_rewards_padded, axis=1) + avg = np.average(cumrewards, axis=0) + std = np.std(cumrewards, axis=0) + x = [i for i in range(len(std))] + plt.plot(x, avg, label=results['title']) + plt.fill_between(x, avg - std, avg + std, alpha=0.5) + + +def fill_with_latest_value(array, length): + pad = length - len(array) + if pad > 0: + return np.pad(array, (0, pad), mode='edge') + else: + return array + + +def plot_episodes_availability_averaged(results): + """Plot availability for a given set of specified episodes""" + data = results['all_episodes_availability'] + longest_episode_length = np.max([len(r) for r in data]) + + all_episodes_padded = [fill_with_latest_value(av, longest_episode_length) for av in data] + avg = np.average(all_episodes_padded, axis=0) + std = np.std(all_episodes_padded, axis=0) + x = [i for i in range(len(std))] + plt.plot(x, avg, label=results['title']) + plt.fill_between(x, avg - std, avg + std, alpha=0.5) + + +def plot_episodes_length(learning_results): + """Plot length of every episode""" + plt.figure(figsize=(10, 8)) + plt.ylabel('#iterations', fontsize=20) + plt.xlabel('episode', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title("Length of each episode", fontsize=12) + + for results in learning_results: + iterations = [len(e) for e in results['all_episodes_rewards']] + episode = [i for i in range(len(results['all_episodes_rewards']))] + plt.plot(episode, iterations, label=f"{results['title']}") + + plt.legend(loc="upper right") + plt.show() + + +def plot_each_episode(results): + """Plot cumulative rewards for each episode""" + for i, episode in enumerate(results['all_episodes_rewards']): + cumrewards = np.cumsum(episode) + x = [i for i in range(len(cumrewards))] + plt.plot(x, cumrewards, label=f'Episode {i}') + + +def plot_all_episodes(r): + """Plot cumulative rewards for every episode""" + new_plot(r['title']) + plot_each_episode(r) + plt.legend(loc="lower right") + plt.show() + + +def plot_averaged_cummulative_rewards(title, all_runs): + """Plot averaged cumulative rewards""" + new_plot(title) + for r in all_runs: + plot_episodes_rewards_averaged(r) + plt.legend(loc="lower right") + plt.show() + + +def plot_averaged_availability(title, all_runs): + """Plot averaged network availability""" + plt.figure(figsize=(10, 8)) + plt.ylabel('network availability', fontsize=20) + 
plt.xlabel('step', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title(title, fontsize=12) + for r in all_runs: + plot_episodes_availability_averaged(r) + plt.legend(loc="lower right") + plt.show() + + +def new_plot_loss(): + """Plot MSE loss averaged over all episodes""" + plt.figure(figsize=(10, 8)) + plt.ylabel('loss', fontsize=20) + plt.xlabel('episodes', fontsize=20) + plt.xticks(size=12) + plt.yticks(size=20) + plt.title("Loss", fontsize=12) + + +def plot_all_episodes_loss(all_episodes_losses, name, label): + """Plot loss for one learning episode""" + x = [i for i in range(len(all_episodes_losses))] + plt.plot(x, all_episodes_losses, label=f'{name} {label}') + + +def running_mean(x, size): + """return moving average of x for a window of lenght 'size'""" + cumsum = np.cumsum(np.insert(x, 0, 0)) + return (cumsum[size:] - cumsum[:-size]) / float(size) + + +class PlotTraining: + """Plot training-related stats""" + + def __init__(self, title, render_each_episode): + self.episode_durations = [] + self.title = title + self.render_each_episode = render_each_episode + + def plot_durations(self, average_window=5): + # plt.figure(2) + plt.figure() + # plt.clf() + durations_t = np.array(self.episode_durations, dtype=np.float32) + plt.title('Training...') + plt.xlabel('Episode') + plt.ylabel('Duration') + plt.title(self.title, fontsize=12) + + episodes = [i + 1 for i in range(len(self.episode_durations))] + plt.plot(episodes, durations_t) + # plot episode running averages + if len(durations_t) >= average_window: + means = running_mean(durations_t, average_window) + means = np.concatenate((np.zeros(average_window - 1), means)) + plt.plot(episodes, means) + + # display.display(plt.gcf()) + plt.show() + + def episode_done(self, length): + self.episode_durations.append(length) + if self.render_each_episode: + self.plot_durations() + + def plot_end(self): + self.plot_durations() + plt.ioff() # type: ignore + # plt.show() + + +def length_of_all_episodes(run): + """Get the length of every episode""" + return [len(e) for e in run['all_episodes_rewards']] + + +def reduce(x, desired_width): + return [np.average(c) for c in np.array_split(x, desired_width)] + + +def episodes_rewards_averaged(run): + """Plot cumulative rewards for a given set of specified episodes""" + max_iteration_count = np.max([len(r) for r in run['all_episodes_rewards']]) + all_episodes_rewards_padded = [pad(rewards, max_iteration_count) for rewards in run['all_episodes_rewards']] + cumrewards = np.cumsum(all_episodes_rewards_padded, axis=1) + avg = np.average(cumrewards, axis=0) + return list(avg) + + +def episodes_lengths_for_all_runs(all_runs): + return [length_of_all_episodes(run) for run in all_runs] + + +def averaged_cummulative_rewards(all_runs, width): + return [reduce(episodes_rewards_averaged(run), width) for run in all_runs] diff --git a/Eligibility Trace agents/Notebooks/.DS_Store b/Eligibility Trace agents/Notebooks/.DS_Store new file mode 100644 index 00000000..bfaabc34 Binary files /dev/null and b/Eligibility Trace agents/Notebooks/.DS_Store differ diff --git a/Eligibility Trace agents/Notebooks/Actor Critic Testing Basic.ipynb b/Eligibility Trace agents/Notebooks/Actor Critic Testing Basic.ipynb new file mode 100644 index 00000000..9c73413c --- /dev/null +++ b/Eligibility Trace agents/Notebooks/Actor Critic Testing Basic.ipynb @@ -0,0 +1,201 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "fae4d125", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + 
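"# note: the sys.path.insert below uses a machine-specific absolute path; adjust it to your local checkout of the Actor-Critic folder\n",
+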
"import logging\n", + "import gym\n", + "from cyberbattle._env.cyberbattle_env import AttackerGoal\n", + "sys.path.insert(0, '/Users/freddiebarrell/Desktop/Repository/Actor-Critic')\n", + "import agent_actor_critic as a\n", + "import agent_wrapper as w\n", + "import learner as learner\n", + "from agent_wrapper import Verbosity\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format=\"%(levelname)s: %(message)s\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3928c9e", + "metadata": {}, + "outputs": [], + "source": [ + "gym_env = gym.make(\"CyberBattleToyCtf-v0\",\n", + " attacker_goal=AttackerGoal(\n", + " reward=430,\n", + " own_atleast_percent=0.6))\n", + "\n", + "ep = w.EnvironmentBounds.of_identifiers(\n", + " maximum_node_count=12,\n", + " maximum_total_credentials=10,\n", + " identifiers=gym_env.identifiers\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fca12ff", + "metadata": {}, + "outputs": [], + "source": [ + "actor_critic_five_episodes = learner.gibbs_softmax_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.ActorCriticPolicy(\n", + " ep,\n", + " gamma=0.01, λ=0.1, learning_rate=0.1, hash_size=98689),\n", + " episode_count=5,\n", + " iteration_count=1000,\n", + " exploit=False,\n", + " render=False,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Actor-Critic five\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c7cc52d", + "metadata": {}, + "outputs": [], + "source": [ + "actor_critic_exploit_five = learner.gibbs_softmax_search(\n", + "gym_env,\n", + "ep,\n", + "learner=actor_critic_five_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "exploit=True,\n", + "render=False,\n", + "plot_episodes_length=True,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Actor_Critic five\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26cefa42", + "metadata": {}, + "outputs": [], + "source": [ + "actor_critic_ten_episodes = learner.gibbs_softmax_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.ActorCriticPolicy(\n", + " ep,\n", + " gamma=0.01, λ=0.1, learning_rate=0.1, hash_size=98689),\n", + " episode_count=10,\n", + " iteration_count=1000,\n", + " exploit=False,\n", + " render=False,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Actor-Critic ten\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "809d50f5", + "metadata": {}, + "outputs": [], + "source": [ + "actor_critic_exploit_ten = learner.gibbs_softmax_search(\n", + "gym_env,\n", + "ep,\n", + "learner=actor_critic_ten_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "exploit=True,\n", + "render=False,\n", + "plot_episodes_length=True,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Actor_Critic ten\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1f68a6d", + "metadata": {}, + "outputs": [], + "source": [ + "actor_critic_fifteen_episodes = learner.gibbs_softmax_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.ActorCriticPolicy(\n", + " ep,\n", + " gamma=0.01, λ=0.1, learning_rate=0.1, hash_size=98689),\n", + " episode_count=15,\n", + " iteration_count=1000,\n", + " exploit=False,\n", + " render=False,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Actor-Critic fifteen\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0614bcc", + "metadata": {}, + "outputs": [], + 
"source": [ + "actor_critic_exploit_fifteen = learner.gibbs_softmax_search(\n", + "gym_env,\n", + "ep,\n", + "learner=actor_critic_fifteen_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "exploit=True,\n", + "render=False,\n", + "plot_episodes_length=True,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Actor_Critic fifteen\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "723eba90", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Eligibility Trace agents/Notebooks/Actor Critic Testing With Defender.ipynb b/Eligibility Trace agents/Notebooks/Actor Critic Testing With Defender.ipynb new file mode 100644 index 00000000..6f1d885e --- /dev/null +++ b/Eligibility Trace agents/Notebooks/Actor Critic Testing With Defender.ipynb @@ -0,0 +1,140 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "19563726", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import logging\n", + "import gym\n", + "import importlib\n", + "\n", + "from cyberbattle._env.defender import ScanAndReimageCompromisedMachines\n", + "from cyberbattle._env.cyberbattle_env import AttackerGoal, DefenderConstraint\n", + "import cyberbattle.agents.baseline.plotting as p\n", + "sys.path.insert(0, '/Users/freddiebarrell/Desktop/Repository/Actor-Critic')\n", + "import agent_actor_critic as a\n", + "import agent_wrapper as w\n", + "import learner as learner\n", + "from agent_wrapper import Verbosity\n", + "\n", + "importlib.reload(learner)\n", + "importlib.reload(p)\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format=\"%(levelname)s: %(message)s\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2793e70b", + "metadata": {}, + "outputs": [], + "source": [ + "gym_env = gym.make(\"CyberBattleToyCtf-v0\",\n", + " attacker_goal=AttackerGoal(\n", + " #reward=430,\n", + " own_atleast_percent=1),\n", + " defender_constraint=DefenderConstraint(\n", + " maintain_sla=0.80\n", + " ),\n", + " defender_agent=ScanAndReimageCompromisedMachines(\n", + " probability=0.6,\n", + " scan_capacity=2,\n", + " scan_frequency=5))\n", + "\n", + "ep = w.EnvironmentBounds.of_identifiers(\n", + " maximum_node_count=12,\n", + " maximum_total_credentials=10,\n", + " identifiers=gym_env.identifiers\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cac15ff", + "metadata": {}, + "outputs": [], + "source": [ + "iteration_count = 1000\n", + "training_episode_count = 15" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5523c34", + "metadata": {}, + "outputs": [], + "source": [ + "actor_critic_with_defender = learner.gibbs_softmax_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.ActorCriticPolicy(\n", + " ep,\n", + " gamma=0.01, λ=0.1, learning_rate=0.1, hash_size=98689),\n", + " episode_count=training_episode_count,\n", + " iteration_count=iteration_count,\n", + " exploit=False,\n", + " render=False,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Actor-Critic with defender\"\n", + ")" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e5f58fe", + "metadata": {}, + "outputs": [], + "source": [ + "actor_critic_exploit_with_defender = learner.gibbs_softmax_search(\n", + "gym_env,\n", + "ep,\n", + "learner=actor_critic_with_defender['learner'],\n", + "episode_count=10,\n", + "iteration_count=1000,\n", + "exploit=True,\n", + "render=False,\n", + "plot_episodes_length=True,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Actor_Critic with defender\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06793ce8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Eligibility Trace agents/Notebooks/Sarsa Lambda Testing Basic.ipynb b/Eligibility Trace agents/Notebooks/Sarsa Lambda Testing Basic.ipynb new file mode 100644 index 00000000..187e4176 --- /dev/null +++ b/Eligibility Trace agents/Notebooks/Sarsa Lambda Testing Basic.ipynb @@ -0,0 +1,207 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "fae4d125", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import logging\n", + "import gym\n", + "from cyberbattle._env.cyberbattle_env import AttackerGoal\n", + "sys.path.insert(0, '/Users/freddiebarrell/Desktop/Repository/Sarsa(Lambda)')\n", + "import agent_sarsa_lambda as a\n", + "import agent_wrapper as w\n", + "import learner as learner\n", + "from agent_wrapper import Verbosity\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format=\"%(levelname)s: %(message)s\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3928c9e", + "metadata": {}, + "outputs": [], + "source": [ + "gym_env = gym.make(\"CyberBattleToyCtf-v0\",\n", + " attacker_goal=AttackerGoal(\n", + " reward=430,\n", + " own_atleast_percent=0.6))\n", + "\n", + "ep = w.EnvironmentBounds.of_identifiers(\n", + " maximum_node_count=12,\n", + " maximum_total_credentials=10,\n", + " identifiers=gym_env.identifiers\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fca12ff", + "metadata": {}, + "outputs": [], + "source": [ + "sarsa_lambda_five_episodes = learner.epsilon_greedy_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.SarsaLambdaPolicy(\n", + " ep,\n", + " gamma=0.015, λ=0.5, learning_rate=0.01, hash_size=98689),\n", + " episode_count=5,\n", + " iteration_count=1000,\n", + " epsilon=0.9,\n", + " render=False,\n", + " epsilon_exponential_decay=1000,\n", + " epsilon_minimum=0.01,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Sarsa_Lambda five\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c7cc52d", + "metadata": {}, + "outputs": [], + "source": [ + "sarsa_lambda_exploit_five = learner.epsilon_greedy_search(\n", + "gym_env,\n", + "ep,\n", + "learner=sarsa_lambda_five_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "epsilon=0,\n", + "render=False,\n", + "epsilon_minimum=0,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Sarsa_Lambda five\"\n", + ")" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "26cefa42", + "metadata": {}, + "outputs": [], + "source": [ + "sarsa_lambda_ten_episodes = learner.epsilon_greedy_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.SarsaLambdaPolicy(\n", + " ep,\n", + " gamma=0.015, λ=0.5, learning_rate=0.01, hash_size=98689),\n", + " episode_count=10,\n", + " iteration_count=1000,\n", + " epsilon=0.9,\n", + " render=False,\n", + " epsilon_exponential_decay=1000,\n", + " epsilon_minimum=0.01,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Sarsa_Lambda ten\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "809d50f5", + "metadata": {}, + "outputs": [], + "source": [ + "sarsa_lambda_exploit_ten = learner.epsilon_greedy_search(\n", + "gym_env,\n", + "ep,\n", + "learner=sarsa_lambda_ten_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "epsilon=0,\n", + "render=False,\n", + "epsilon_minimum=0,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Sarsa_Lambda ten\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1f68a6d", + "metadata": {}, + "outputs": [], + "source": [ + "sarsa_lambda_fifteen_episodes = learner.epsilon_greedy_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.SarsaLambdaPolicy(\n", + " ep,\n", + " gamma=0.015, λ=0.5, learning_rate=0.01, hash_size=98689),\n", + " episode_count=15,\n", + " iteration_count=1000,\n", + " epsilon=0.9,\n", + " render=False,\n", + " epsilon_exponential_decay=1000,\n", + " epsilon_minimum=0.01,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Sarsa_Lambda fifteen\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0614bcc", + "metadata": {}, + "outputs": [], + "source": [ + "sarsa_lambda_exploit_fifteen = learner.epsilon_greedy_search(\n", + "gym_env,\n", + "ep,\n", + "learner=sarsa_lambda_fifteen_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "epsilon=0,\n", + "render=False,\n", + "epsilon_minimum=0,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Sarsa_Lambda fifteen\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09551250", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Eligibility Trace agents/Notebooks/Sarsa Lambda Testing With Defender.ipynb b/Eligibility Trace agents/Notebooks/Sarsa Lambda Testing With Defender.ipynb new file mode 100644 index 00000000..6dea57de --- /dev/null +++ b/Eligibility Trace agents/Notebooks/Sarsa Lambda Testing With Defender.ipynb @@ -0,0 +1,142 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "19563726", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import logging\n", + "import gym\n", + "import importlib\n", + "\n", + "from cyberbattle._env.defender import ScanAndReimageCompromisedMachines\n", + "from cyberbattle._env.cyberbattle_env import AttackerGoal, DefenderConstraint\n", + "import cyberbattle.agents.baseline.plotting as p\n", + "sys.path.insert(0, '/Users/freddiebarrell/Desktop/Repository/Sarsa(Lambda)')\n", + "import 
agent_sarsa_lambda as a\n", + "import agent_wrapper as w\n", + "import learner as learner\n", + "from agent_wrapper import Verbosity\n", + "\n", + "importlib.reload(learner)\n", + "importlib.reload(p)\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format=\"%(levelname)s: %(message)s\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2793e70b", + "metadata": {}, + "outputs": [], + "source": [ + "gym_env = gym.make(\"CyberBattleToyCtf-v0\",\n", + " attacker_goal=AttackerGoal(\n", + " #reward=430,\n", + " own_atleast_percent=1),\n", + " defender_constraint=DefenderConstraint(\n", + " maintain_sla=0.80\n", + " ),\n", + " defender_agent=ScanAndReimageCompromisedMachines(\n", + " probability=0.6,\n", + " scan_capacity=2,\n", + " scan_frequency=5))\n", + "\n", + "ep = w.EnvironmentBounds.of_identifiers(\n", + " maximum_node_count=12,\n", + " maximum_total_credentials=10,\n", + " identifiers=gym_env.identifiers\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cac15ff", + "metadata": {}, + "outputs": [], + "source": [ + "iteration_count = 1000\n", + "training_episode_count = 15" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5523c34", + "metadata": {}, + "outputs": [], + "source": [ + "sarsa_lambda_with_defender = learner.epsilon_greedy_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.SarsaLambdaPolicy(\n", + " ep,\n", + " gamma=0.015, λ=0.5, learning_rate=0.01, hash_size=98689),\n", + " episode_count=training_episode_count,\n", + " iteration_count=iteration_count,\n", + " epsilon=0.9,\n", + " render=False,\n", + " epsilon_exponential_decay=1000,\n", + " epsilon_minimum=0.01,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Sarsa_Lambda with defender\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e5f58fe", + "metadata": {}, + "outputs": [], + "source": [ + "sarsa_lambda_exploit_with_defender = learner.epsilon_greedy_search(\n", + "gym_env,\n", + "ep,\n", + "learner=sarsa_lambda_with_defender['learner'],\n", + "episode_count=10,\n", + "iteration_count=1000,\n", + "epsilon=0,\n", + "render=False,\n", + "epsilon_minimum=0,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Sarsa_lambda with defender\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06793ce8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Eligibility Trace agents/Notebooks/Watkins Q Testing Basic.ipynb b/Eligibility Trace agents/Notebooks/Watkins Q Testing Basic.ipynb new file mode 100644 index 00000000..90021ef6 --- /dev/null +++ b/Eligibility Trace agents/Notebooks/Watkins Q Testing Basic.ipynb @@ -0,0 +1,207 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "fae4d125", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import logging\n", + "import gym\n", + "from cyberbattle._env.cyberbattle_env import AttackerGoal\n", + "sys.path.insert(0, '/Users/freddiebarrell/Desktop/Repository/Watkins Q')\n", + "import agent_watkins_q as a\n", + "import 
agent_wrapper as w\n", + "import learner as learner\n", + "from agent_wrapper import Verbosity\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format=\"%(levelname)s: %(message)s\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3928c9e", + "metadata": {}, + "outputs": [], + "source": [ + "gym_env = gym.make(\"CyberBattleToyCtf-v0\",\n", + " attacker_goal=AttackerGoal(\n", + " reward=430,\n", + " own_atleast_percent=0.6))\n", + "\n", + "ep = w.EnvironmentBounds.of_identifiers(\n", + " maximum_node_count=12,\n", + " maximum_total_credentials=10,\n", + " identifiers=gym_env.identifiers\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fca12ff", + "metadata": {}, + "outputs": [], + "source": [ + "watkins_q_five_episodes = learner.epsilon_greedy_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.WatkinsQPolicy(\n", + " ep,\n", + " gamma=0.015, λ=0.5, learning_rate=0.01, hash_size=98689),\n", + " episode_count=5,\n", + " iteration_count=1000,\n", + " epsilon=0.9,\n", + " render=False,\n", + " epsilon_exponential_decay=1000,\n", + " epsilon_minimum=0.01,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Watkins_Q five\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c7cc52d", + "metadata": {}, + "outputs": [], + "source": [ + "watkins_q_exploit_five = learner.epsilon_greedy_search(\n", + "gym_env,\n", + "ep,\n", + "learner=watkins_q_five_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "epsilon=0,\n", + "render=False,\n", + "epsilon_minimum=0,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Watkins_Q five\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26cefa42", + "metadata": {}, + "outputs": [], + "source": [ + "watkins_q_ten_episodes = learner.epsilon_greedy_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.WatkinsQPolicy(\n", + " ep,\n", + " gamma=0.015, λ=0.5, learning_rate=0.01, hash_size=98689),\n", + " episode_count=10,\n", + " iteration_count=1000,\n", + " epsilon=0.9,\n", + " render=False,\n", + " epsilon_exponential_decay=1000,\n", + " epsilon_minimum=0.01,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Watkins_Q ten\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "809d50f5", + "metadata": {}, + "outputs": [], + "source": [ + "watkins_q_exploit_ten = learner.epsilon_greedy_search(\n", + "gym_env,\n", + "ep,\n", + "learner=watkins_q_ten_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "epsilon=0,\n", + "render=False,\n", + "epsilon_minimum=0,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Watkins_Q ten\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1f68a6d", + "metadata": {}, + "outputs": [], + "source": [ + "watkins_q_fifteen_episodes = learner.epsilon_greedy_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.WatkinsQPolicy(\n", + " ep,\n", + " gamma=0.015, λ=0.5, learning_rate=0.01, hash_size=98689),\n", + " episode_count=15,\n", + " iteration_count=1000,\n", + " epsilon=0.9,\n", + " render=False,\n", + " epsilon_exponential_decay=1000,\n", + " epsilon_minimum=0.01,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Watkins_Q fifteen\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0614bcc", + "metadata": {}, + "outputs": [], + "source": [ + "watkins_q_exploit_fifteen = learner.epsilon_greedy_search(\n", + "gym_env,\n", 
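+ "# exploitation run: epsilon=0 so the agent greedily follows the values learned above\n",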
+ "ep,\n", + "learner=watkins_q_fifteen_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "epsilon=0,\n", + "render=False,\n", + "epsilon_minimum=0,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Watkins_Q fifteen\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09551250", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Eligibility Trace agents/Notebooks/Watkins Q Testing With Defender.ipynb b/Eligibility Trace agents/Notebooks/Watkins Q Testing With Defender.ipynb new file mode 100644 index 00000000..3d7c9cf6 --- /dev/null +++ b/Eligibility Trace agents/Notebooks/Watkins Q Testing With Defender.ipynb @@ -0,0 +1,142 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "19563726", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import logging\n", + "import gym\n", + "import importlib\n", + "\n", + "from cyberbattle._env.defender import ScanAndReimageCompromisedMachines\n", + "from cyberbattle._env.cyberbattle_env import AttackerGoal, DefenderConstraint\n", + "import cyberbattle.agents.baseline.plotting as p\n", + "sys.path.insert(0, '/Users/freddiebarrell/Desktop/Repository/Watkins Q')\n", + "import agent_watkins_q as a\n", + "import agent_wrapper as w\n", + "import learner as learner\n", + "from agent_wrapper import Verbosity\n", + "\n", + "importlib.reload(learner)\n", + "importlib.reload(p)\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format=\"%(levelname)s: %(message)s\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2793e70b", + "metadata": {}, + "outputs": [], + "source": [ + "gym_env = gym.make(\"CyberBattleToyCtf-v0\",\n", + " attacker_goal=AttackerGoal(\n", + " #reward=430,\n", + " own_atleast_percent=1),\n", + " defender_constraint=DefenderConstraint(\n", + " maintain_sla=0.80\n", + " ),\n", + " defender_agent=ScanAndReimageCompromisedMachines(\n", + " probability=0.6,\n", + " scan_capacity=2,\n", + " scan_frequency=5))\n", + "\n", + "ep = w.EnvironmentBounds.of_identifiers(\n", + " maximum_node_count=12,\n", + " maximum_total_credentials=10,\n", + " identifiers=gym_env.identifiers\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cac15ff", + "metadata": {}, + "outputs": [], + "source": [ + "iteration_count = 1000\n", + "training_episode_count = 15" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5523c34", + "metadata": {}, + "outputs": [], + "source": [ + "watkins_q_with_defender = learner.epsilon_greedy_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.WatkinsQPolicy(\n", + " ep,\n", + " gamma=0.015, λ=0.5, learning_rate=0.01, hash_size=98689),\n", + " episode_count=training_episode_count,\n", + " iteration_count=iteration_count,\n", + " epsilon=0.9,\n", + " render=False,\n", + " epsilon_exponential_decay=1000,\n", + " epsilon_minimum=0.01,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Watkins_Q with defender\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "4e5f58fe", + "metadata": {}, + "outputs": [], + "source": [ + "watkins_q_exploit_with_defender = learner.epsilon_greedy_search(\n", + "gym_env,\n", + "ep,\n", + "learner=watkins_q_with_defender['learner'],\n", + "episode_count=10,\n", + "iteration_count=1000,\n", + "epsilon=0,\n", + "render=False,\n", + "epsilon_minimum=0,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Watkins_Q with defender\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06793ce8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Eligibility Trace agents/Sarsa(lambda)/.DS_Store b/Eligibility Trace agents/Sarsa(lambda)/.DS_Store new file mode 100644 index 00000000..07af60bf Binary files /dev/null and b/Eligibility Trace agents/Sarsa(lambda)/.DS_Store differ diff --git a/Eligibility Trace agents/Sarsa(lambda)/agent_sarsa_lambda.py b/Eligibility Trace agents/Sarsa(lambda)/agent_sarsa_lambda.py new file mode 100644 index 00000000..1423ee6d --- /dev/null +++ b/Eligibility Trace agents/Sarsa(lambda)/agent_sarsa_lambda.py @@ -0,0 +1,256 @@ +import random +from typing import NamedTuple, Optional, Tuple, Union, List +import numpy as np +from numpy import ndarray +import logging +import boolean + +from cyberbattle._env import cyberbattle_env +from agent_wrapper import EnvironmentBounds +from gym import spaces, Wrapper + +import agent_wrapper as w +from learner import Learner + +from torch import Tensor +import torch + +class StateActionModel: + + def __init__(self, ep: EnvironmentBounds): + self.ep = ep + + self.global_features = w.ConcatFeatures(ep, [ + w.Feature_discovered_not_owned_nodes_sliding(ep), + w.Feature_discovered_credential_count(ep) + ]) + + self.source_node_features = w.ConcatFeatures(ep, [ + w.Feature_active_node_properties(ep), + w.Feature_success_actions_at_node(ep) + ]) + + self.target_node_features = w.ConcatFeatures(ep, [ + w.Feature_active_node_id(ep) + ]) + + self.state_space = w.ConcatFeatures(ep, self.global_features.feature_selection + + self.source_node_features.feature_selection + + self.target_node_features.feature_selection) + + self.action_space = w.AbstractAction(ep) + + def valid_actions(self, wrapped_env: w.AgentWrapper, observation): + """returns a list of valid actions and the nodes they can be carried out from""" + + nodes_and_actions = [] + discovered_nodes = np.union1d(w.owned_nodes(observation), w.discovered_nodes_notowned(observation)) + + for from_node in w.owned_nodes(observation): + for local_action in range(self.action_space.n_local_actions): + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, local_action, None) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, local_action, -1)) + + for remote_action in range(self.action_space.n_local_actions, self.action_space.n_local_actions + self.action_space.n_remote_actions): + for target_node in discovered_nodes: + if target_node != from_node: + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, 
remote_action, target_node) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, remote_action, target_node)) + + for connect_action in range(self.action_space.n_local_actions + self.action_space.n_remote_actions, self.action_space.n_actions): + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, connect_action, None) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, connect_action, -1)) + + return nodes_and_actions + +class Memory: + + def __init__(self, ep:EnvironmentBounds, hash_size): + self.hash_size = hash_size + + self.memory = torch.zeros([2, hash_size], dtype=torch.float64) + + def state_action_index(self, state_space, abstract_action): + """Turns a state action pair into an index for the memory tensor""" + feature_vector = np.append(state_space, abstract_action) + hash_number = abs(hash(str(feature_vector))) + return hash_number % self.hash_size + +class ChosenActionMetadata(NamedTuple): + + abstract_action: np.int32 + actor_node: int + actor_features: ndarray + actor_state: ndarray + + def __repr__(self) -> str: + return f"[abstract_action={self.abstract_action}, actor={self.actor_node}, state={self.actor_state}]" + +class SarsaLambdaPolicy(Learner): + + def __init__(self, + ep: EnvironmentBounds, + gamma: float, + λ: float, + learning_rate: float, + hash_size: int + ): + + self.model = StateActionModel(ep) + self.n_local_actions = ep.local_attacks_count + self.n_remote_actions = ep.remote_attacks_count + self.gamma = gamma + self.λ = λ + self.learning_rate = learning_rate + self.hash_size = hash_size + + self.memory = Memory(ep, hash_size=hash_size) + + def parameters_as_string(self): + return f'γ={self.gamma}, lr={self.learning_rate}, λ={self.λ},\n' \ + f'hash_size={self.hash_size}' + + def all_parameters_as_string(self) -> str: + model = self.model + return f'{self.parameters_as_string()}\n' \ + f'dimension={model.state_space.flat_size()}x{model.action_space.flat_size()}, ' \ + f'Q={[f.name() for f in model.state_space.feature_selection]} ' \ + f"-> 'abstract_action'" + + def get_actor_state_vector(self, global_state: ndarray, actor_features: ndarray, target_features: Optional[ndarray]) -> ndarray: + """Turns seperate state features into one vector""" + if target_features is None: + return np.concatenate((np.array(global_state, dtype=np.float32), + np.array(actor_features, dtype=np.float32))) + else: + return np.concatenate((np.array(global_state, dtype=np.float32), + np.array(actor_features, dtype=np.float32), + np.array(target_features, dtype=np.float32))) + + def update_memory(self, + reward: float, + actor_state: ndarray, + abstract_action: int, + next_actor_state: Optional[ndarray], + next_abstract_action: Optional[int]): + + current_state_action_index = self.memory.state_action_index(actor_state, abstract_action) + if next_actor_state is None: + δ = reward - self.memory.memory[0][current_state_action_index].item() + else: + next_state_action_index = self.memory.state_action_index(next_actor_state, next_abstract_action) + δ = reward + (self.gamma * self.memory.memory[0][next_state_action_index].item()) - self.memory.memory[0][current_state_action_index].item() + + self.memory.memory[1][current_state_action_index] += 1 + + non_zero_indicies_q = torch.argwhere(self.memory.memory[0]).numpy() + non_zero_indicies_e = torch.argwhere(self.memory.memory[1]).numpy() + non_zero_indicies = 
np.union1d(non_zero_indicies_q, non_zero_indicies_e) + + for i in non_zero_indicies: + + self.memory.memory[0][i] = self.memory.memory[0][i].item() + float(self.learning_rate * δ * self.memory.memory[1][i].item()) + self.memory.memory[0][i] = round(self.memory.memory[0][i].item(), 5) + self.memory.memory[0][i] = max(0, self.memory.memory[0][i].item()) + self.memory.memory[0][i] = min(100, self.memory.memory[0][i].item()) + + self.memory.memory[1][i] = self.memory.memory[1][i].item() * float(self.gamma * self.λ) + self.memory.memory[1][i] = round(self.memory.memory[0][i].item(), 5) + + def on_step(self, wrapped_env: w.AgentWrapper, + observation, reward: float, done: bool, action_metadata, epsilon): + + if done: + self.update_memory(reward, + actor_state=action_metadata.actor_state, + abstract_action=action_metadata.abstract_action, + next_actor_state=None, + next_abstract_action=None + ) + else: + x = np.random.rand() + if x <= epsilon: + _, _, future_action_metadata = self.explore(wrapped_env) + else: + _, _, future_action_metadata = self.exploit(wrapped_env, observation) + + self.update_memory(reward, + actor_state=action_metadata.actor_state, + abstract_action=action_metadata.abstract_action, + next_actor_state=future_action_metadata.actor_state, + next_abstract_action=future_action_metadata.abstract_action + ) + + + def new_episode(self): + torch.mul(self.memory.memory[1], 0) + + def end_of_episode(self, i_episode, t): + return None + + def end_of_iteration(self, t, done): + return None + + def metadata_from_gymaction(self, wrapped_env, gym_action): + """Takes in a gym action and returns it's metadata""" + current_global_state = self.model.global_features.get(wrapped_env.state, node=None) + actor_node = cyberbattle_env.sourcenode_of_action(gym_action) + actor_features = self.model.source_node_features.get(wrapped_env.state, actor_node) + abstract_action = self.model.action_space.abstract_from_gymaction(gym_action) + + if 'remote_vulnerability' in gym_action: + target_node = self.model.target_node_features.get(wrapped_env.state, gym_action['remote_vulnerability'][1]) + else: + target_node = None + + return ChosenActionMetadata( + abstract_action=abstract_action, + actor_node=actor_node, + actor_features=actor_features, + actor_state=self.get_actor_state_vector(current_global_state, actor_features, target_node)) + + def stateaction_as_string(self, action_metadata) -> str: + return '' + + def explore(self, wrapped_env: w.AgentWrapper + ) -> Tuple[str, cyberbattle_env.Action, object]: + + gym_action = wrapped_env.env.sample_valid_action(kinds=[0, 1, 2]) + metadata = self.metadata_from_gymaction(wrapped_env, gym_action) + return gym_action, metadata + + def exploit(self, wrapped_env: w.AgentWrapper, observation) -> Tuple[str, Optional[cyberbattle_env.Action], object]: + + current_global_state = self.model.global_features.get(wrapped_env.state, node=None) + valid_nodes_and_actions = self.model.valid_actions(wrapped_env, observation) + + #The q_values are the estimated returns from an action taken in the current state + q_values = [] + for item in valid_nodes_and_actions: + source_node_features = self.model.source_node_features.get(wrapped_env.state, item[0]) + + if item[1] < self.n_local_actions or item[1] - self.n_local_actions > self.n_remote_actions: + actor_state_vector = self.get_actor_state_vector(current_global_state, source_node_features, None) + else: + target_node_features = self.model.target_node_features.get(wrapped_env.state, item[2]) + actor_state_vector = 
self.get_actor_state_vector(current_global_state, source_node_features, target_node_features) + + action_state_index = self.memory.state_action_index(actor_state_vector, item[1]) + + q_values.append(self.memory.memory[0][action_state_index].item()) + + indicies_of_chosen_actions = [i for i, x in enumerate(q_values) if x == max(q_values)] + chosen_action_index = random.choice(indicies_of_chosen_actions) + chosen_action = valid_nodes_and_actions[chosen_action_index] + + if chosen_action[1] < self.n_local_actions or chosen_action[1] - self.n_local_actions > self.n_remote_actions: + gym_action = self.model.action_space.abstract_to_gymaction(chosen_action[0], observation, chosen_action[1], None) + else: + gym_action = self.model.action_space.abstract_to_gymaction(chosen_action[0], observation, chosen_action[1], chosen_action[2]) + + metadata = self.metadata_from_gymaction(wrapped_env, gym_action) + + return gym_action, metadata \ No newline at end of file diff --git a/Eligibility Trace agents/Sarsa(lambda)/agent_wrapper.py b/Eligibility Trace agents/Sarsa(lambda)/agent_wrapper.py new file mode 100644 index 00000000..985aa1e9 --- /dev/null +++ b/Eligibility Trace agents/Sarsa(lambda)/agent_wrapper.py @@ -0,0 +1,342 @@ +from cyberbattle._env.cyberbattle_env import EnvironmentBounds +from typing import Optional, List +import enum +import numpy as np +from gym import spaces, Wrapper +from numpy import ndarray +import cyberbattle._env.cyberbattle_env as cyberbattle_env +import logging + + +class StateAugmentation: + """Default agent state augmentation, consisting of the gym environment + observation itself and nothing more.""" + + def __init__(self, observation: cyberbattle_env.Observation): + self.observation = observation + + def on_step(self, action: cyberbattle_env.Action, reward: float, done: bool, observation: cyberbattle_env.Observation): + self.observation = observation + + def on_reset(self, observation: cyberbattle_env.Observation): + self.observation = observation + + +class Feature(spaces.MultiDiscrete): + """ + Feature consisting of multiple discrete dimensions. + Parameters: + nvec: is a vector defining the number of possible values + for each discrete space. 
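+
+    Example (a hypothetical subclass, not defined in this module):
+
+        class Feature_owned_node_count(Feature):
+            # count of owned nodes, one discrete dimension in [0, maximum_node_count]
+            def __init__(self, p: EnvironmentBounds):
+                super().__init__(p, [p.maximum_node_count + 1])
+
+            def get(self, a: StateAugmentation, node):
+                return np.array([len(owned_nodes(a.observation))], dtype=np.int32)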
+ """ + + def __init__(self, env_properties: EnvironmentBounds, nvec): + self.env_properties = env_properties + super().__init__(nvec) + + def flat_size(self): + return np.prod(self.nvec) + + def name(self): + """Return the name of the feature""" + p = len(type(Feature(self.env_properties, [])).__name__) + 1 + return type(self).__name__[p:] + + def get(self, a: StateAugmentation, node: Optional[int]) -> np.ndarray: + """Compute the current value of a feature value at + the current observation and specific node""" + raise NotImplementedError + + def pretty_print(self, v): + return v + +class Feature_active_node_properties(Feature): + """Bitmask of all properties set for the active node""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [2] * p.property_count) + + def get(self, a: StateAugmentation, node) -> ndarray: + assert node is not None, 'feature only valid in the context of a node' + + node_prop = a.observation['discovered_nodes_properties'] + + # list of all properties set/unset on the node + # Remap to get rid of unknown value 0: 1 -> 1, and -1 -> 0 (and 0-> 0) + assert node < len(node_prop), f'invalid node index {node} (not discovered yet)' + remapped = np.array((1 + node_prop[node]) / 2, dtype=int) + return remapped + + +class Feature_active_node_id(Feature): + """Return the node id itself""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count] * 1) + + def get(self, a: StateAugmentation, node) -> ndarray: + return np.array([node], dtype=int) + + +class Feature_discovered_credential_count(Feature): + """number of credentials discovered so far""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_total_credentials + 1]) + + def get(self, a: StateAugmentation, node): + return [len(a.observation['credential_cache_matrix'])] + + +class Feature_discovered_not_owned_nodes_sliding(Feature): + """array of which of discovered nodes not owned by name""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count + 1]) + + def get(self, a: StateAugmentation, node): + discovered = a.observation['discovered_nodes'] + levels = a.observation['nodes_privilegelevel'] + owned_nodes_indices = np.where(levels > 0)[0] + owned = [] + for i in owned_nodes_indices: + owned.append(discovered[i]) + discovered_not_owned = [] + for node in discovered: + if node not in owned: + discovered_not_owned.append(node) + discovered_not_owned_sliding = np.zeros(self.env_properties.maximum_node_count, np.int32) + for node_id in discovered_not_owned: + if node_id == 'client': + discovered_not_owned_sliding[0] = 1 + elif node_id == 'Website': + discovered_not_owned_sliding[1] = 1 + elif node_id == 'Website.Directory': + discovered_not_owned_sliding[2] = 1 + elif node_id == 'Website[user=monitor]': + discovered_not_owned_sliding[3] = 1 + elif node_id == 'GitHubProject': + discovered_not_owned_sliding[4] = 1 + elif node_id == 'AzureStorage': + discovered_not_owned_sliding[5] = 1 + elif node_id == 'Sharepoint': + discovered_not_owned_sliding[6] = 1 + elif node_id == 'AzureResourceManager': + discovered_not_owned_sliding[7] = 1 + elif node_id == 'AzureResourceManager[user-monitor]': + discovered_not_owned_sliding[8] = 1 + elif node_id == 'AzureVM': + discovered_not_owned_sliding[9] = 1 + return discovered_not_owned_sliding + +class Feature_active_node_id(Feature): + """number asigned to each type of node in toy-ctf""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count + 
1]) + + def get(self, a: StateAugmentation, node): + node_id = a.observation['discovered_nodes'][node] + + node_id_array = np.zeros(1, np.int32) + if node_id == 'client': + node_id_array[0] = 0 + elif node_id == 'Website': + node_id_array[0] = 1 + elif node_id == 'Website.Directory': + node_id_array[0] = 2 + elif node_id == 'Website[user=monitor]': + node_id_array[0] = 3 + elif node_id == 'GitHubProject': + node_id_array[0] = 4 + elif node_id == 'AzureStorage': + node_id_array[0] = 5 + elif node_id == 'Sharepoint': + node_id_array[0] = 6 + elif node_id == 'AzureResourceManager': + node_id_array[0] = 7 + elif node_id == 'AzureResourceManager[user-monitor]': + node_id_array[0] = 8 + elif node_id == 'AzureVM': + node_id_array[0] = 9 + else: + node_id_array[0] = 10 + return node_id_array + + +class ConcatFeatures(Feature): + """ Concatenate a list of features into a single feature + Parameters: + feature_selection - a selection of features to combine + """ + + def __init__(self, p: EnvironmentBounds, feature_selection: List[Feature]): + self.feature_selection = feature_selection + self.dim_sizes = np.concatenate([f.nvec for f in feature_selection]) + super().__init__(p, [self.dim_sizes]) + + def pretty_print(self, v): + return v + + def get(self, a: StateAugmentation, node=None) -> np.ndarray: + """Return the feature vector""" + feature_vector = [f.get(a, node) for f in self.feature_selection] + return np.concatenate(feature_vector) + + +def owned_nodes(observation): + """Return the list of owned nodes""" + return np.nonzero(observation['nodes_privilegelevel'])[0] + + +def discovered_nodes_notowned(observation): + """Return the list of discovered nodes that are not owned yet""" + return np.nonzero(observation['nodes_privilegelevel'] == 0)[0] + + +class AbstractAction(Feature): + """An abstraction of the gym state space that reduces + the space dimension for learning use to just + - local_attack(vulnid) (source_node provided) + - remote_attack(vulnid) (source_node provided, target_node forgotten) + - connect(port) (source_node provided, target_node forgotten, credentials infered from cache) + """ + + def __init__(self, p: EnvironmentBounds): + self.n_local_actions = p.local_attacks_count + self.n_remote_actions = p.remote_attacks_count + self.n_connect_actions = p.port_count + self.n_actions = self.n_local_actions + self.n_remote_actions + self.n_connect_actions + super().__init__(p, [self.n_actions]) + + def abstract_to_gymaction(self, source_node, observation, abstract_action, target_node): + """Takes a statring node and an abstract action number and returns a gym action""" + + if abstract_action < self.n_local_actions: + vuln = abstract_action + return {'local_vulnerability': np.array([source_node, vuln])} + + node_prop = observation['discovered_nodes_properties'] + abstract_action -= self.n_local_actions + if abstract_action < self.n_remote_actions: + vuln = abstract_action + + discovered_nodes_count = len(node_prop) + if discovered_nodes_count <= 1: + return None + + return {'remote_vulnerability': np.array([source_node, target_node, vuln])} + + abstract_action -= self.n_remote_actions + port = np.int32(abstract_action) + + discovered_credentials = np.array(observation['credential_cache_matrix']) + n_discovered_creds = len(discovered_credentials) + if n_discovered_creds <= 0: + return None + + nodes_not_owned = discovered_nodes_notowned(observation) + match_port = discovered_credentials[:, 1] == port + match_port_indicies = np.where(match_port)[0] + + credential_indices_choices = [c for c 
in match_port_indicies + if discovered_credentials[c, 0] in nodes_not_owned] + + if credential_indices_choices: + logging.debug('found matching cred in the credential cache') + else: + logging.debug('no cred matching requested port, trying instead creds used to access other ports') + credential_indices_choices = [i for (i, n) in enumerate(discovered_credentials[:, 0]) + if n in nodes_not_owned] + + if credential_indices_choices: + logging.debug('found cred in the credential cache without matching port name') + else: + logging.debug('no cred to use from the credential cache') + return None + + cred = np.int32(np.random.choice(credential_indices_choices)) + target = np.int32(discovered_credentials[cred, 0]) + return {'connect': np.array([source_node, target, port, cred], dtype=np.int32)} + + def abstract_from_gymaction(self, gym_action: cyberbattle_env.Action) -> np.int32: + """Turns a gym action into it's abstract action number""" + if 'local_vulnerability' in gym_action: + return gym_action['local_vulnerability'][1] + elif 'remote_vulnerability' in gym_action: + r = gym_action['remote_vulnerability'] + return self.n_local_actions + r[2] + + assert 'connect' in gym_action + c = gym_action['connect'] + + a = self.n_local_actions + self.n_remote_actions + c[2] + assert a < self.n_actions + return np.int32(a) + + +class ActionTrackingStateAugmentation(StateAugmentation): + """An agent state augmentation consisting of + the environment observation augmented with the following dynamic information: + - success_action_count: count of action taken and succeeded at the current node + - failed_action_count: count of action taken and failed at the current node + """ + + def __init__(self, p: EnvironmentBounds, observation: cyberbattle_env.Observation): + self.aa = AbstractAction(p) + self.success_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.failed_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.env_properties = p + super().__init__(observation) + + def on_step(self, action: cyberbattle_env.Action, reward: float, done: bool, observation: cyberbattle_env.Observation): + node = cyberbattle_env.sourcenode_of_action(action) + abstract_action = self.aa.abstract_from_gymaction(action) + if reward > 0: + self.success_action_count[node, abstract_action] += 1 + else: + self.failed_action_count[node, abstract_action] += 1 + super().on_step(action, reward, done, observation) + + def on_reset(self, observation: cyberbattle_env.Observation): + p = self.env_properties + self.success_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.failed_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + super().on_reset(observation) + + +class Feature_success_actions_at_node(Feature): + """number of time each action succeeded at a given node""" + + max_action_count = 100 + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [self.max_action_count] * AbstractAction(p).n_actions) + + def get(self, a: ActionTrackingStateAugmentation, node: int): + return np.minimum(a.success_action_count[node, :], self.max_action_count - 1) + + +class Verbosity(enum.Enum): + """Verbosity of the learning function""" + Quiet = 0 + Normal = 1 + Verbose = 2 + + +class AgentWrapper(Wrapper): + """Gym wrapper to update the agent state on every step""" + + def __init__(self, env: cyberbattle_env.CyberBattleEnv, state: StateAugmentation): + super().__init__(env) 
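+        # `state` is the StateAugmentation (e.g. ActionTrackingStateAugmentation)
+        # shared with the feature extractors; step() and reset() below forward every
+        # observation to it so that Feature.get() always sees up-to-date data.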
+ self.state = state + + def step(self, action: cyberbattle_env.Action): + observation, reward, done, info = self.env.step(action) + self.state.on_step(action, reward, done, observation) + return observation, reward, done, info + + def reset(self): + observation = self.env.reset() + self.state.on_reset(observation) + return observation \ No newline at end of file diff --git a/Eligibility Trace agents/Sarsa(lambda)/learner.py b/Eligibility Trace agents/Sarsa(lambda)/learner.py new file mode 100644 index 00000000..7553e66e --- /dev/null +++ b/Eligibility Trace agents/Sarsa(lambda)/learner.py @@ -0,0 +1,269 @@ +import math +import sys + +from plotting import PlotTraining, plot_averaged_cummulative_rewards +from agent_wrapper import AgentWrapper, EnvironmentBounds, Verbosity, ActionTrackingStateAugmentation +import logging +import numpy as np +from cyberbattle._env import cyberbattle_env +from typing import Tuple, Optional, TypedDict, List +import progressbar +import abc + +class Agent(abc.ABC): + + @abc.abstractmethod + def explore(self, wrapped_env: AgentWrapper) -> Tuple[cyberbattle_env.Action, object]: + """Exploration function. + Returns (action_type, gym_action, action_metadata) where + action_metadata is a custom object that gets passed to the on_step callback function""" + raise NotImplementedError + + @abc.abstractmethod + def exploit(self, wrapped_env: AgentWrapper, observation) -> Tuple[Optional[cyberbattle_env.Action], object]: + """Exploit function. + Returns (action_type, gym_action, action_metadata) where + action_metadata is a custom object that gets passed to the on_step callback function""" + raise NotImplementedError + + @abc.abstractmethod + def on_step(self, wrapped_env: AgentWrapper, observation, reward, done, action_metadata, epsilon) -> None: + raise NotImplementedError + + def parameters_as_string(self) -> str: + return '' + + def all_parameters_as_string(self) -> str: + return '' + + def loss_as_string(self) -> str: + return '' + + def stateaction_as_string(self, action_metadata) -> str: + return '' + +Breakdown = TypedDict('Breakdown', { + 'local': int, + 'remote': int, + 'connect': int +}) + +Outcomes = TypedDict('Outcomes', { + 'reward': Breakdown, + 'noreward': Breakdown +}) + +Stats = TypedDict('Stats', { + 'exploit': Outcomes, + 'explore': Outcomes +}) + +TrainedAgent = TypedDict('TrainedAgent', { + 'all_episodes_rewards': List[List[float]], + 'all_episodes_availability': List[List[float]], + 'agent': Agent, + 'trained_on': str, + 'title': str +}) + +def print_stats(stats): + """Print learning statistics""" + def print_breakdown(stats, actiontype: str): + def ratio(kind: str) -> str: + x, y = stats[actiontype]['reward'][kind], stats[actiontype]['noreward'][kind] + sum = x + y + if sum == 0: + return 'NaN' + else: + return f"{(x / sum):.2f}" + + def print_kind(kind: str): + print( + f" {actiontype}-{kind}: {stats[actiontype]['reward'][kind]}/{stats[actiontype]['noreward'][kind]} " + f"({ratio(kind)})") + print_kind('local') + print_kind('remote') + print_kind('connect') + + print(" Breakdown [Reward/NoReward (Success rate)]") + print_breakdown(stats, 'explore') + print_breakdown(stats, 'exploit') + +def epsilon_greedy_search( + cyberbattle_gym_env: cyberbattle_env.CyberBattleEnv, + environment_properties: EnvironmentBounds, + agent: Agent, + title: str, + episode_count: int, + iteration_count: int, + epsilon: float, + epsilon_minimum=0.0, + epsilon_multdecay: Optional[float] = None, + epsilon_exponential_decay: Optional[int] = None, + render=True, + 
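+        # file prefix for PNG snapshots written when the final episode earns a reward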
render_last_episode_rewards_to: Optional[str] = None, + verbosity: Verbosity = Verbosity.Normal, + plot_episodes_length=True +) -> TrainedAgent: + + print(f"###### {title}\n" + f"Learning with: episode_count={episode_count}," + f"iteration_count={iteration_count}," + f"ϵ={epsilon}," + f'ϵ_min={epsilon_minimum}, ' + + (f"ϵ_multdecay={epsilon_multdecay}," if epsilon_multdecay else '') + + (f"ϵ_expdecay={epsilon_exponential_decay}," if epsilon_exponential_decay else '') + + f"{agent.parameters_as_string()}") + + initial_epsilon = epsilon + + all_episodes_rewards = [] + all_episodes_availability = [] + + wrapped_env = AgentWrapper(cyberbattle_gym_env, + ActionTrackingStateAugmentation(environment_properties, cyberbattle_gym_env.reset())) + steps_done = 0 + plot_title = f"{title} (epochs={episode_count}, ϵ={initial_epsilon}, ϵ_min={epsilon_minimum}," \ + + (f"ϵ_multdecay={epsilon_multdecay}," if epsilon_multdecay else '') \ + + (f"ϵ_expdecay={epsilon_exponential_decay}," if epsilon_exponential_decay else '') \ + + agent.parameters_as_string() + plottraining = PlotTraining(title=plot_title, render_each_episode=render) + + render_file_index = 1 + + for i_episode in range(1, episode_count + 1): + + print(f" ## Episode: {i_episode}/{episode_count} '{title}' " + f"ϵ={epsilon:.4f}, " + f"{agent.parameters_as_string()}") + + observation = wrapped_env.reset() + total_reward = 0.0 + all_rewards = [] + all_availability = [] + agent.new_episode() + + stats = Stats(exploit=Outcomes(reward=Breakdown(local=0, remote=0, connect=0), + noreward=Breakdown(local=0, remote=0, connect=0)), + explore=Outcomes(reward=Breakdown(local=0, remote=0, connect=0), + noreward=Breakdown(local=0, remote=0, connect=0)) + ) + + episode_ended_at = None + sys.stdout.flush() + + bar = progressbar.ProgressBar( + widgets=[ + 'Episode ', + f'{i_episode}', + '|Iteration ', + progressbar.Counter(), + '|', + progressbar.Variable(name='reward', width=6, precision=10), + '|', + progressbar.Variable(name='last_reward_at', width=4), + '|', + progressbar.Timer(), + progressbar.Bar() + ], + redirect_stdout=False) + + for t in bar(range(1, 1 + iteration_count)): + + if epsilon_exponential_decay: + epsilon = epsilon_minimum + math.exp(-1. 
* steps_done / + epsilon_exponential_decay) * (initial_epsilon - epsilon_minimum) + + steps_done += 1 + + x = np.random.rand() + if x <= epsilon: + gym_action, action_metadata = agent.explore(wrapped_env) + action_style = "explore" + else: + gym_action, action_metadata = agent.exploit(wrapped_env, observation) + action_style = "exploit" + + # Take the step + logging.debug(f"gym_action={gym_action}, action_metadata={action_metadata}") + observation, reward, done, info = wrapped_env.step(gym_action) + + action_type = 'exploit' if action_style == 'exploit' else 'explore' + outcome = 'reward' if reward > 0 else 'noreward' + if 'local_vulnerability' in gym_action: + stats[action_type][outcome]['local'] += 1 + elif 'remote_vulnerability' in gym_action: + stats[action_type][outcome]['remote'] += 1 + else: + stats[action_type][outcome]['connect'] += 1 + + agent.on_step(wrapped_env, observation, reward, done, action_metadata, epsilon) + assert np.shape(reward) == () + + all_rewards.append(reward) + all_availability.append(info['network_availability']) + total_reward += reward + bar.update(t, reward=total_reward) + if reward > 0: + bar.update(t, last_reward_at=t) + + if verbosity == Verbosity.Verbose or (verbosity == Verbosity.Normal and reward > 0): + sign = ['-', '+'][reward > 0] + + print(f" {sign} t={t} {action_style} r={reward} cum_reward:{total_reward} " + f"a={action_metadata}-{gym_action} " + f"creds={len(observation['credential_cache_matrix'])} " + f" {agent.stateaction_as_string(action_metadata)}") + + if i_episode == episode_count \ + and render_last_episode_rewards_to is not None \ + and reward > 0: + fig = cyberbattle_gym_env.render_as_fig() + fig.write_image(f"{render_last_episode_rewards_to}-e{i_episode}-{render_file_index}.png") + render_file_index += 1 + + agent.end_of_iteration(t, done) + + if done: + episode_ended_at = t + bar.finish(dirty=True) + break + + sys.stdout.flush() + + loss_string = agent.loss_as_string() + if loss_string: + loss_string = "loss={loss_string}" + + if episode_ended_at: + print(f" Episode {i_episode} ended at t={episode_ended_at} {loss_string}") + else: + print(f" Episode {i_episode} stopped at t={iteration_count} {loss_string}") + + print_stats(stats) + + all_episodes_rewards.append(all_rewards) + all_episodes_availability.append(all_availability) + + length = episode_ended_at if episode_ended_at else iteration_count + agent.end_of_episode(i_episode=i_episode, t=length) + if plot_episodes_length: + plottraining.episode_done(length) + if render: + wrapped_env.render() + + if epsilon_multdecay: + epsilon = max(epsilon_minimum, epsilon * epsilon_multdecay) + + wrapped_env.close() + print("simulation ended") + if plot_episodes_length: + plottraining.plot_end() + + return TrainedAgent( + all_episodes_rewards=all_episodes_rewards, + all_episodes_availability=all_episodes_availability, + agent=agent, + trained_on=cyberbattle_gym_env.name, + title=plot_title + ) \ No newline at end of file diff --git a/Eligibility Trace agents/Sarsa(lambda)/plotting.py b/Eligibility Trace agents/Sarsa(lambda)/plotting.py new file mode 100644 index 00000000..e51b4a55 --- /dev/null +++ b/Eligibility Trace agents/Sarsa(lambda)/plotting.py @@ -0,0 +1,203 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
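+#
+# Typical use (a sketch; the run variables are the result dicts returned by
+# learner.epsilon_greedy_search, as in the accompanying notebooks):
+#
+#     runs = [watkins_q_five_episodes, watkins_q_ten_episodes]
+#     plot_averaged_cummulative_rewards("Watkins Q(λ) benchmark", runs)
+#     plot_averaged_availability("network availability", runs)
+#     plot_episodes_length(runs)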
+ +"""Plotting helpers for agent banchmarking""" + +import matplotlib.pyplot as plt # type:ignore +import numpy as np + + +def new_plot(title): + """Prepare a new plot of cumulative rewards""" + plt.figure(figsize=(10, 8)) + plt.ylabel('cumulative reward', fontsize=20) + plt.xlabel('step', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title(title, fontsize=12) + + +def pad(array, length): + """Pad an array with 0s to make it of desired length""" + padding = np.zeros((length,)) + padding[:len(array)] = array + return padding + + +def plot_episodes_rewards_averaged(results): + """Plot cumulative rewards for a given set of specified episodes""" + max_iteration_count = np.max([len(r) for r in results['all_episodes_rewards']]) + + all_episodes_rewards_padded = [pad(rewards, max_iteration_count) for rewards in results['all_episodes_rewards']] + cumrewards = np.cumsum(all_episodes_rewards_padded, axis=1) + avg = np.average(cumrewards, axis=0) + std = np.std(cumrewards, axis=0) + x = [i for i in range(len(std))] + plt.plot(x, avg, label=results['title']) + plt.fill_between(x, avg - std, avg + std, alpha=0.5) + + +def fill_with_latest_value(array, length): + pad = length - len(array) + if pad > 0: + return np.pad(array, (0, pad), mode='edge') + else: + return array + + +def plot_episodes_availability_averaged(results): + """Plot availability for a given set of specified episodes""" + data = results['all_episodes_availability'] + longest_episode_length = np.max([len(r) for r in data]) + + all_episodes_padded = [fill_with_latest_value(av, longest_episode_length) for av in data] + avg = np.average(all_episodes_padded, axis=0) + std = np.std(all_episodes_padded, axis=0) + x = [i for i in range(len(std))] + plt.plot(x, avg, label=results['title']) + plt.fill_between(x, avg - std, avg + std, alpha=0.5) + + +def plot_episodes_length(learning_results): + """Plot length of every episode""" + plt.figure(figsize=(10, 8)) + plt.ylabel('#iterations', fontsize=20) + plt.xlabel('episode', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title("Length of each episode", fontsize=12) + + for results in learning_results: + iterations = [len(e) for e in results['all_episodes_rewards']] + episode = [i for i in range(len(results['all_episodes_rewards']))] + plt.plot(episode, iterations, label=f"{results['title']}") + + plt.legend(loc="upper right") + plt.show() + + +def plot_each_episode(results): + """Plot cumulative rewards for each episode""" + for i, episode in enumerate(results['all_episodes_rewards']): + cumrewards = np.cumsum(episode) + x = [i for i in range(len(cumrewards))] + plt.plot(x, cumrewards, label=f'Episode {i}') + + +def plot_all_episodes(r): + """Plot cumulative rewards for every episode""" + new_plot(r['title']) + plot_each_episode(r) + plt.legend(loc="lower right") + plt.show() + + +def plot_averaged_cummulative_rewards(title, all_runs): + """Plot averaged cumulative rewards""" + new_plot(title) + for r in all_runs: + plot_episodes_rewards_averaged(r) + plt.legend(loc="lower right") + plt.show() + + +def plot_averaged_availability(title, all_runs): + """Plot averaged network availability""" + plt.figure(figsize=(10, 8)) + plt.ylabel('network availability', fontsize=20) + plt.xlabel('step', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title(title, fontsize=12) + for r in all_runs: + plot_episodes_availability_averaged(r) + plt.legend(loc="lower right") + plt.show() + + +def new_plot_loss(): + """Plot MSE loss averaged over all episodes""" + 
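+    # Only prepares the shared axes; per-run loss curves are overlaid afterwards
+    # with plot_all_episodes_loss().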
plt.figure(figsize=(10, 8)) + plt.ylabel('loss', fontsize=20) + plt.xlabel('episodes', fontsize=20) + plt.xticks(size=12) + plt.yticks(size=20) + plt.title("Loss", fontsize=12) + + +def plot_all_episodes_loss(all_episodes_losses, name, label): + """Plot loss for one learning episode""" + x = [i for i in range(len(all_episodes_losses))] + plt.plot(x, all_episodes_losses, label=f'{name} {label}') + + +def running_mean(x, size): + """return moving average of x for a window of lenght 'size'""" + cumsum = np.cumsum(np.insert(x, 0, 0)) + return (cumsum[size:] - cumsum[:-size]) / float(size) + + +class PlotTraining: + """Plot training-related stats""" + + def __init__(self, title, render_each_episode): + self.episode_durations = [] + self.title = title + self.render_each_episode = render_each_episode + + def plot_durations(self, average_window=5): + # plt.figure(2) + plt.figure() + # plt.clf() + durations_t = np.array(self.episode_durations, dtype=np.float32) + plt.title('Training...') + plt.xlabel('Episode') + plt.ylabel('Duration') + plt.title(self.title, fontsize=12) + + episodes = [i + 1 for i in range(len(self.episode_durations))] + plt.plot(episodes, durations_t) + # plot episode running averages + if len(durations_t) >= average_window: + means = running_mean(durations_t, average_window) + means = np.concatenate((np.zeros(average_window - 1), means)) + plt.plot(episodes, means) + + # display.display(plt.gcf()) + plt.show() + + def episode_done(self, length): + self.episode_durations.append(length) + if self.render_each_episode: + self.plot_durations() + + def plot_end(self): + self.plot_durations() + plt.ioff() # type: ignore + # plt.show() + + +def length_of_all_episodes(run): + """Get the length of every episode""" + return [len(e) for e in run['all_episodes_rewards']] + + +def reduce(x, desired_width): + return [np.average(c) for c in np.array_split(x, desired_width)] + + +def episodes_rewards_averaged(run): + """Plot cumulative rewards for a given set of specified episodes""" + max_iteration_count = np.max([len(r) for r in run['all_episodes_rewards']]) + all_episodes_rewards_padded = [pad(rewards, max_iteration_count) for rewards in run['all_episodes_rewards']] + cumrewards = np.cumsum(all_episodes_rewards_padded, axis=1) + avg = np.average(cumrewards, axis=0) + return list(avg) + + +def episodes_lengths_for_all_runs(all_runs): + return [length_of_all_episodes(run) for run in all_runs] + + +def averaged_cummulative_rewards(all_runs, width): + return [reduce(episodes_rewards_averaged(run), width) for run in all_runs] diff --git a/Eligibility Trace agents/Watkins Q/.DS_Store b/Eligibility Trace agents/Watkins Q/.DS_Store new file mode 100644 index 00000000..77ff7bfb Binary files /dev/null and b/Eligibility Trace agents/Watkins Q/.DS_Store differ diff --git a/Eligibility Trace agents/Watkins Q/agent_watkins_q.py b/Eligibility Trace agents/Watkins Q/agent_watkins_q.py new file mode 100644 index 00000000..061493fa --- /dev/null +++ b/Eligibility Trace agents/Watkins Q/agent_watkins_q.py @@ -0,0 +1,303 @@ +import random +from typing import NamedTuple, Optional, Tuple, Union, List +import numpy as np +from numpy import ndarray +import logging +import boolean + +from cyberbattle._env import cyberbattle_env +from agent_wrapper import EnvironmentBounds, discovered_nodes_notowned +from gym import spaces, Wrapper + +import agent_wrapper as w +from learner import Learner + +from torch import Tensor +import torch + +class StateActionModel: + + def __init__(self, ep: EnvironmentBounds): + self.ep = ep 
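+        # The state is modelled as three feature groups: global environment
+        # features, features of the source (acting) node, and features of the
+        # optional target node; WatkinsQPolicy.get_actor_state_vector()
+        # concatenates whichever of them apply to a given action.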
+ + self.global_features = w.ConcatFeatures(ep, [ + w.Feature_discovered_not_owned_nodes_sliding(ep), + w.Feature_discovered_credential_count(ep) + ]) + + self.source_node_features = w.ConcatFeatures(ep, [ + w.Feature_active_node_properties(ep), + w.Feature_success_actions_at_node(ep) + ]) + + self.target_node_features = w.ConcatFeatures(ep, [ + w.Feature_active_node_id(ep) + ]) + + self.state_space = w.ConcatFeatures(ep, self.global_features.feature_selection + + self.source_node_features.feature_selection + + self.target_node_features.feature_selection) + + self.action_space = w.AbstractAction(ep) + + def valid_actions(self, wrapped_env: w.AgentWrapper, observation): + """returns a list of valid actions and the nodes they can be carried out from""" + + nodes_and_actions = [] + discovered_nodes = np.union1d(w.owned_nodes(observation), w.discovered_nodes_notowned(observation)) + + for from_node in w.owned_nodes(observation): + for local_action in range(self.action_space.n_local_actions): + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, local_action, None) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, local_action, -1)) + + for remote_action in range(self.action_space.n_local_actions, self.action_space.n_local_actions + self.action_space.n_remote_actions): + for target_node in discovered_nodes: + if target_node != from_node: + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, remote_action, target_node) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, remote_action, target_node)) + + for connect_action in range(self.action_space.n_local_actions + self.action_space.n_remote_actions, self.action_space.n_actions): + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, connect_action, None) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, connect_action, -1)) + + return nodes_and_actions + +class Memory: + + def __init__(self, ep:EnvironmentBounds, hash_size): + self.hash_size = hash_size + + self.memory = torch.zeros([2, hash_size], dtype=torch.float64) + + def state_action_index(self, state_space, abstract_action): + """Turns a state action pair into an index for the memory tensor""" + feature_vector = np.append(state_space, abstract_action) + hash_number = abs(hash(str(feature_vector))) + + return hash_number % self.hash_size + +class ChosenActionMetadata(NamedTuple): + + abstract_action: np.int32 + actor_node: int + actor_features: ndarray + actor_state: ndarray + + def __repr__(self) -> str: + return f"[abstract_action={self.abstract_action}, actor={self.actor_node}, state={self.actor_state}]" + +class WatkinsQPolicy(Learner): + + def __init__(self, + ep: EnvironmentBounds, + gamma: float, + λ: float, + learning_rate: float, + hash_size: int + ): + + self.model = StateActionModel(ep) + self.n_local_actions = ep.local_attacks_count + self.n_remote_actions = ep.remote_attacks_count + self.n_actions = self.n_local_actions + self.n_remote_actions + ep.port_count + self.gamma = gamma + self.λ = λ + self.learning_rate = learning_rate + self.hash_size = hash_size + + self.memory = Memory(ep, hash_size) + + def parameters_as_string(self): + return f'γ={self.gamma}, lr={self.learning_rate}, λ={self.λ},\n' + + def all_parameters_as_string(self) -> str: + 
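+        """Hyper-parameters plus the state/action-space dimensions and the selected state features."""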
model = self.model + return f'{self.parameters_as_string()}\n' \ + f'dimension={model.state_space.flat_size()}x{model.action_space.flat_size()}, ' \ + f'Q={[f.name() for f in model.state_space.feature_selection]} ' \ + f"-> 'abstract_action'" + + def get_actor_state_vector(self, global_state: ndarray, actor_features: ndarray, target_features: Optional[ndarray]) -> ndarray: + """Turns seperate state features into one vector""" + if target_features is None: + return np.concatenate((np.array(global_state, dtype=np.float32), + np.array(actor_features, dtype=np.float32))) + else: + return np.concatenate((np.array(global_state, dtype=np.float32), + np.array(actor_features, dtype=np.float32), + np.array(target_features, dtype=np.float32))) + + def update_memory(self, + reward: float, + actor_state: ndarray, + abstract_action: int, + next_actor_state: Optional[ndarray], + next_abstract_action: Optional[int], + chosen_action_is_max = boolean): + + current_state_action_index = self.memory.state_action_index(actor_state, abstract_action) + if next_actor_state is None: + δ = reward - self.memory.memory[0][current_state_action_index].item() + else: + next_state_action_index = self.memory.state_action_index(next_actor_state, next_abstract_action) + δ = reward + (self.gamma * self.memory.memory[0][next_state_action_index].item()) - self.memory.memory[0][current_state_action_index].item() + + self.memory.memory[1][current_state_action_index] += 1 + + non_zero_indicies_q = torch.argwhere(self.memory.memory[0]).numpy() + non_zero_indicies_e = torch.argwhere(self.memory.memory[1]).numpy() + non_zero_indicies = np.union1d(non_zero_indicies_q, non_zero_indicies_e) + + for i in non_zero_indicies: + + self.memory.memory[0][i] = self.memory.memory[0][i].item() + float(self.learning_rate * δ * self.memory.memory[1][i].item()) + self.memory.memory[0][i] = round(self.memory.memory[0][i].item(), 5) + self.memory.memory[0][i] = max(0, self.memory.memory[0][i].item()) + self.memory.memory[0][i] = min(100, self.memory.memory[0][i].item()) + + if chosen_action_is_max: + self.memory.memory[1][i] = self.memory.memory[1][i].item() * float(self.gamma * self.λ) + self.memory.memory[1][i] = round(self.memory.memory[0][i].item(), 5) + else: + self.memory.memory[1][i] = 0 + + def on_step(self, wrapped_env: w.AgentWrapper, + observation, reward: float, done: bool, action_metadata, epsilon): + + if done: + self.update_memory(reward, + actor_state=action_metadata.actor_state, + abstract_action=action_metadata.abstract_action, + next_actor_state=None, + next_abstract_action=None, + chosen_action_is_max=False + ) + else: + x = np.random.rand() + if x <= epsilon: + _, _, chosen_next_action_metadata = self.explore(wrapped_env) + else: + _, _, chosen_next_action_metadata = self.exploit(wrapped_env, observation) + + chosen_action_pair = ((list(chosen_next_action_metadata.actor_state), chosen_next_action_metadata.abstract_action)) + max_action_pairs = self.max_action_in_state(wrapped_env, observation) + + if chosen_action_pair in max_action_pairs: + next_action_pair = chosen_action_pair + chosen_action_is_max = True + else: + next_action_pair = random.choice(max_action_pairs) + chosen_action_is_max = False + + self.update_memory(reward, + actor_state=action_metadata.actor_state, + abstract_action=action_metadata.abstract_action, + next_actor_state=next_action_pair[0], + next_abstract_action=next_action_pair[1], + chosen_action_is_max=chosen_action_is_max + ) + + + def new_episode(self): + torch.mul(self.memory.memory[1], 0) + + def 
end_of_episode(self, i_episode, t): + return None + + def end_of_iteration(self, t, done): + return None + + def metadata_from_gymaction(self, wrapped_env, gym_action): + """Takes in a gym action and returns it's metadata""" + current_global_state = self.model.global_features.get(wrapped_env.state, node=None) + actor_node = cyberbattle_env.sourcenode_of_action(gym_action) + actor_features = self.model.source_node_features.get(wrapped_env.state, actor_node) + abstract_action = self.model.action_space.abstract_from_gymaction(gym_action) + + if 'remote_vulnerability' in gym_action: + target_node = self.model.target_node_features.get(wrapped_env.state, gym_action['remote_vulnerability'][1]) + else: + target_node = None + + return ChosenActionMetadata( + abstract_action=abstract_action, + actor_node=actor_node, + actor_features=actor_features, + actor_state=self.get_actor_state_vector(current_global_state, actor_features, target_node)) + + def stateaction_as_string(self, action_metadata) -> str: + return '' + + def explore(self, wrapped_env: w.AgentWrapper + ) -> Tuple[str, cyberbattle_env.Action, object]: + + gym_action = wrapped_env.env.sample_valid_action(kinds=[0, 1, 2]) + metadata = self.metadata_from_gymaction(wrapped_env, gym_action) + return gym_action, metadata + + def exploit(self, wrapped_env: w.AgentWrapper, observation) -> Tuple[str, Optional[cyberbattle_env.Action], object]: + + current_global_state = self.model.global_features.get(wrapped_env.state, node=None) + valid_nodes_and_actions = self.model.valid_actions(wrapped_env, observation) + + #The q_values are the estimated returns from an action taken in the current state + q_values = [] + for item in valid_nodes_and_actions: + source_node_features = self.model.source_node_features.get(wrapped_env.state, item[0]) + + if item[1] < self.n_local_actions or item[1] - self.n_local_actions > self.n_remote_actions: + actor_state_vector = self.get_actor_state_vector(current_global_state, source_node_features, None) + else: + target_node_features = self.model.target_node_features.get(wrapped_env.state, item[2]) + actor_state_vector = self.get_actor_state_vector(current_global_state, source_node_features, target_node_features) + + action_state_index = self.memory.state_action_index(actor_state_vector, item[1]) + + q_values.append(self.memory.memory[0][action_state_index].item()) + + indicies_of_chosen_actions = [i for i, x in enumerate(q_values) if x == max(q_values)] + chosen_action_index = random.choice(indicies_of_chosen_actions) + chosen_action = valid_nodes_and_actions[chosen_action_index] + + if chosen_action[1] < self.n_local_actions or chosen_action[1] - self.n_local_actions > self.n_remote_actions: + gym_action = self.model.action_space.abstract_to_gymaction(chosen_action[0], observation, chosen_action[1], None) + else: + gym_action = self.model.action_space.abstract_to_gymaction(chosen_action[0], observation, chosen_action[1], chosen_action[2]) + + metadata = self.metadata_from_gymaction(wrapped_env, gym_action) + + return gym_action, metadata + + def max_action_in_state(self, wrapped_env: w.AgentWrapper, observation): + current_global_state = self.model.global_features.get(wrapped_env.state, node=None) + valid_nodes_and_actions = self.model.valid_actions(wrapped_env, observation) + + q_values = [] + states_and_actions = [] + for item in valid_nodes_and_actions: + source_node_features = self.model.source_node_features.get(wrapped_env.state, item[0]) + + if item[1] < self.n_local_actions: + actor_state_vector = 
self.get_actor_state_vector(current_global_state, source_node_features, None) + elif item[1] - self.n_local_actions < self.n_remote_actions: + target_node_features = self.model.target_node_features.get(wrapped_env.state, item[2]) + actor_state_vector = self.get_actor_state_vector(current_global_state, source_node_features, target_node_features) + else: + actor_state_vector = self.get_actor_state_vector(current_global_state, source_node_features, None) + + action_state_index = self.memory.state_action_index(actor_state_vector, item[1]) + + q_values.append(self.memory.memory[0][action_state_index].item()) + states_and_actions.append((list(actor_state_vector), item[1])) + + indicies_of_chosen_actions = [i for i, x in enumerate(q_values) if x == max(q_values)] + to_return = [] + for i in range(len(states_and_actions)): + if i in indicies_of_chosen_actions: + to_return.append(states_and_actions[i]) + + return to_return \ No newline at end of file diff --git a/Eligibility Trace agents/Watkins Q/agent_wrapper.py b/Eligibility Trace agents/Watkins Q/agent_wrapper.py new file mode 100644 index 00000000..985aa1e9 --- /dev/null +++ b/Eligibility Trace agents/Watkins Q/agent_wrapper.py @@ -0,0 +1,342 @@ +from cyberbattle._env.cyberbattle_env import EnvironmentBounds +from typing import Optional, List +import enum +import numpy as np +from gym import spaces, Wrapper +from numpy import ndarray +import cyberbattle._env.cyberbattle_env as cyberbattle_env +import logging + + +class StateAugmentation: + """Default agent state augmentation, consisting of the gym environment + observation itself and nothing more.""" + + def __init__(self, observation: cyberbattle_env.Observation): + self.observation = observation + + def on_step(self, action: cyberbattle_env.Action, reward: float, done: bool, observation: cyberbattle_env.Observation): + self.observation = observation + + def on_reset(self, observation: cyberbattle_env.Observation): + self.observation = observation + + +class Feature(spaces.MultiDiscrete): + """ + Feature consisting of multiple discrete dimensions. + Parameters: + nvec: is a vector defining the number of possible values + for each discrete space. 
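+
+    Individual features are typically combined with ConcatFeatures to build the
+    flat state vector, e.g. (a sketch; `state_augmentation` stands for any
+    StateAugmentation instance):
+
+        global_features = ConcatFeatures(p, [Feature_discovered_credential_count(p)])
+        vector = global_features.get(state_augmentation, node=None)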
+ """ + + def __init__(self, env_properties: EnvironmentBounds, nvec): + self.env_properties = env_properties + super().__init__(nvec) + + def flat_size(self): + return np.prod(self.nvec) + + def name(self): + """Return the name of the feature""" + p = len(type(Feature(self.env_properties, [])).__name__) + 1 + return type(self).__name__[p:] + + def get(self, a: StateAugmentation, node: Optional[int]) -> np.ndarray: + """Compute the current value of a feature value at + the current observation and specific node""" + raise NotImplementedError + + def pretty_print(self, v): + return v + +class Feature_active_node_properties(Feature): + """Bitmask of all properties set for the active node""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [2] * p.property_count) + + def get(self, a: StateAugmentation, node) -> ndarray: + assert node is not None, 'feature only valid in the context of a node' + + node_prop = a.observation['discovered_nodes_properties'] + + # list of all properties set/unset on the node + # Remap to get rid of unknown value 0: 1 -> 1, and -1 -> 0 (and 0-> 0) + assert node < len(node_prop), f'invalid node index {node} (not discovered yet)' + remapped = np.array((1 + node_prop[node]) / 2, dtype=int) + return remapped + + +class Feature_active_node_id(Feature): + """Return the node id itself""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count] * 1) + + def get(self, a: StateAugmentation, node) -> ndarray: + return np.array([node], dtype=int) + + +class Feature_discovered_credential_count(Feature): + """number of credentials discovered so far""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_total_credentials + 1]) + + def get(self, a: StateAugmentation, node): + return [len(a.observation['credential_cache_matrix'])] + + +class Feature_discovered_not_owned_nodes_sliding(Feature): + """array of which of discovered nodes not owned by name""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count + 1]) + + def get(self, a: StateAugmentation, node): + discovered = a.observation['discovered_nodes'] + levels = a.observation['nodes_privilegelevel'] + owned_nodes_indices = np.where(levels > 0)[0] + owned = [] + for i in owned_nodes_indices: + owned.append(discovered[i]) + discovered_not_owned = [] + for node in discovered: + if node not in owned: + discovered_not_owned.append(node) + discovered_not_owned_sliding = np.zeros(self.env_properties.maximum_node_count, np.int32) + for node_id in discovered_not_owned: + if node_id == 'client': + discovered_not_owned_sliding[0] = 1 + elif node_id == 'Website': + discovered_not_owned_sliding[1] = 1 + elif node_id == 'Website.Directory': + discovered_not_owned_sliding[2] = 1 + elif node_id == 'Website[user=monitor]': + discovered_not_owned_sliding[3] = 1 + elif node_id == 'GitHubProject': + discovered_not_owned_sliding[4] = 1 + elif node_id == 'AzureStorage': + discovered_not_owned_sliding[5] = 1 + elif node_id == 'Sharepoint': + discovered_not_owned_sliding[6] = 1 + elif node_id == 'AzureResourceManager': + discovered_not_owned_sliding[7] = 1 + elif node_id == 'AzureResourceManager[user-monitor]': + discovered_not_owned_sliding[8] = 1 + elif node_id == 'AzureVM': + discovered_not_owned_sliding[9] = 1 + return discovered_not_owned_sliding + +class Feature_active_node_id(Feature): + """number asigned to each type of node in toy-ctf""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count + 
1]) + + def get(self, a: StateAugmentation, node): + node_id = a.observation['discovered_nodes'][node] + + node_id_array = np.zeros(1, np.int32) + if node_id == 'client': + node_id_array[0] = 0 + elif node_id == 'Website': + node_id_array[0] = 1 + elif node_id == 'Website.Directory': + node_id_array[0] = 2 + elif node_id == 'Website[user=monitor]': + node_id_array[0] = 3 + elif node_id == 'GitHubProject': + node_id_array[0] = 4 + elif node_id == 'AzureStorage': + node_id_array[0] = 5 + elif node_id == 'Sharepoint': + node_id_array[0] = 6 + elif node_id == 'AzureResourceManager': + node_id_array[0] = 7 + elif node_id == 'AzureResourceManager[user-monitor]': + node_id_array[0] = 8 + elif node_id == 'AzureVM': + node_id_array[0] = 9 + else: + node_id_array[0] = 10 + return node_id_array + + +class ConcatFeatures(Feature): + """ Concatenate a list of features into a single feature + Parameters: + feature_selection - a selection of features to combine + """ + + def __init__(self, p: EnvironmentBounds, feature_selection: List[Feature]): + self.feature_selection = feature_selection + self.dim_sizes = np.concatenate([f.nvec for f in feature_selection]) + super().__init__(p, [self.dim_sizes]) + + def pretty_print(self, v): + return v + + def get(self, a: StateAugmentation, node=None) -> np.ndarray: + """Return the feature vector""" + feature_vector = [f.get(a, node) for f in self.feature_selection] + return np.concatenate(feature_vector) + + +def owned_nodes(observation): + """Return the list of owned nodes""" + return np.nonzero(observation['nodes_privilegelevel'])[0] + + +def discovered_nodes_notowned(observation): + """Return the list of discovered nodes that are not owned yet""" + return np.nonzero(observation['nodes_privilegelevel'] == 0)[0] + + +class AbstractAction(Feature): + """An abstraction of the gym state space that reduces + the space dimension for learning use to just + - local_attack(vulnid) (source_node provided) + - remote_attack(vulnid) (source_node provided, target_node forgotten) + - connect(port) (source_node provided, target_node forgotten, credentials infered from cache) + """ + + def __init__(self, p: EnvironmentBounds): + self.n_local_actions = p.local_attacks_count + self.n_remote_actions = p.remote_attacks_count + self.n_connect_actions = p.port_count + self.n_actions = self.n_local_actions + self.n_remote_actions + self.n_connect_actions + super().__init__(p, [self.n_actions]) + + def abstract_to_gymaction(self, source_node, observation, abstract_action, target_node): + """Takes a statring node and an abstract action number and returns a gym action""" + + if abstract_action < self.n_local_actions: + vuln = abstract_action + return {'local_vulnerability': np.array([source_node, vuln])} + + node_prop = observation['discovered_nodes_properties'] + abstract_action -= self.n_local_actions + if abstract_action < self.n_remote_actions: + vuln = abstract_action + + discovered_nodes_count = len(node_prop) + if discovered_nodes_count <= 1: + return None + + return {'remote_vulnerability': np.array([source_node, target_node, vuln])} + + abstract_action -= self.n_remote_actions + port = np.int32(abstract_action) + + discovered_credentials = np.array(observation['credential_cache_matrix']) + n_discovered_creds = len(discovered_credentials) + if n_discovered_creds <= 0: + return None + + nodes_not_owned = discovered_nodes_notowned(observation) + match_port = discovered_credentials[:, 1] == port + match_port_indicies = np.where(match_port)[0] + + credential_indices_choices = [c for c 
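+                                       # keep only cached credentials whose target node is not owned yet;
+                                       # the fallback just below reuses any credential pointing at a
+                                       # not-yet-owned node when none matches the requested port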
in match_port_indicies + if discovered_credentials[c, 0] in nodes_not_owned] + + if credential_indices_choices: + logging.debug('found matching cred in the credential cache') + else: + logging.debug('no cred matching requested port, trying instead creds used to access other ports') + credential_indices_choices = [i for (i, n) in enumerate(discovered_credentials[:, 0]) + if n in nodes_not_owned] + + if credential_indices_choices: + logging.debug('found cred in the credential cache without matching port name') + else: + logging.debug('no cred to use from the credential cache') + return None + + cred = np.int32(np.random.choice(credential_indices_choices)) + target = np.int32(discovered_credentials[cred, 0]) + return {'connect': np.array([source_node, target, port, cred], dtype=np.int32)} + + def abstract_from_gymaction(self, gym_action: cyberbattle_env.Action) -> np.int32: + """Turns a gym action into it's abstract action number""" + if 'local_vulnerability' in gym_action: + return gym_action['local_vulnerability'][1] + elif 'remote_vulnerability' in gym_action: + r = gym_action['remote_vulnerability'] + return self.n_local_actions + r[2] + + assert 'connect' in gym_action + c = gym_action['connect'] + + a = self.n_local_actions + self.n_remote_actions + c[2] + assert a < self.n_actions + return np.int32(a) + + +class ActionTrackingStateAugmentation(StateAugmentation): + """An agent state augmentation consisting of + the environment observation augmented with the following dynamic information: + - success_action_count: count of action taken and succeeded at the current node + - failed_action_count: count of action taken and failed at the current node + """ + + def __init__(self, p: EnvironmentBounds, observation: cyberbattle_env.Observation): + self.aa = AbstractAction(p) + self.success_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.failed_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.env_properties = p + super().__init__(observation) + + def on_step(self, action: cyberbattle_env.Action, reward: float, done: bool, observation: cyberbattle_env.Observation): + node = cyberbattle_env.sourcenode_of_action(action) + abstract_action = self.aa.abstract_from_gymaction(action) + if reward > 0: + self.success_action_count[node, abstract_action] += 1 + else: + self.failed_action_count[node, abstract_action] += 1 + super().on_step(action, reward, done, observation) + + def on_reset(self, observation: cyberbattle_env.Observation): + p = self.env_properties + self.success_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.failed_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + super().on_reset(observation) + + +class Feature_success_actions_at_node(Feature): + """number of time each action succeeded at a given node""" + + max_action_count = 100 + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [self.max_action_count] * AbstractAction(p).n_actions) + + def get(self, a: ActionTrackingStateAugmentation, node: int): + return np.minimum(a.success_action_count[node, :], self.max_action_count - 1) + + +class Verbosity(enum.Enum): + """Verbosity of the learning function""" + Quiet = 0 + Normal = 1 + Verbose = 2 + + +class AgentWrapper(Wrapper): + """Gym wrapper to update the agent state on every step""" + + def __init__(self, env: cyberbattle_env.CyberBattleEnv, state: StateAugmentation): + super().__init__(env) 
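+        # keep a handle on the state augmentation so that step() and reset()
+        # below can refresh it with every new observation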
+ self.state = state + + def step(self, action: cyberbattle_env.Action): + observation, reward, done, info = self.env.step(action) + self.state.on_step(action, reward, done, observation) + return observation, reward, done, info + + def reset(self): + observation = self.env.reset() + self.state.on_reset(observation) + return observation \ No newline at end of file diff --git a/Eligibility Trace agents/Watkins Q/learner.py b/Eligibility Trace agents/Watkins Q/learner.py new file mode 100644 index 00000000..afaefd1f --- /dev/null +++ b/Eligibility Trace agents/Watkins Q/learner.py @@ -0,0 +1,270 @@ +import math +import sys + +from plotting import PlotTraining, plot_averaged_cummulative_rewards +from agent_wrapper import AgentWrapper, EnvironmentBounds, Verbosity, ActionTrackingStateAugmentation +import logging +import numpy as np +from cyberbattle._env import cyberbattle_env +from typing import Tuple, Optional, TypedDict, List +import progressbar +import abc + +class Agent(abc.ABC): + + @abc.abstractmethod + def explore(self, wrapped_env: AgentWrapper) -> Tuple[cyberbattle_env.Action, object]: + """Exploration function. + Returns (action_type, gym_action, action_metadata) where + action_metadata is a custom object that gets passed to the on_step callback function""" + raise NotImplementedError + + @abc.abstractmethod + def exploit(self, wrapped_env: AgentWrapper, observation) -> Tuple[Optional[cyberbattle_env.Action], object]: + """Exploit function. + Returns (action_type, gym_action, action_metadata) where + action_metadata is a custom object that gets passed to the on_step callback function""" + raise NotImplementedError + + @abc.abstractmethod + def on_step(self, wrapped_env: AgentWrapper, observation, reward, done, action_metadata, epsilon) -> None: + raise NotImplementedError + + def parameters_as_string(self) -> str: + return '' + + def all_parameters_as_string(self) -> str: + return '' + + def loss_as_string(self) -> str: + return '' + + def stateaction_as_string(self, action_metadata) -> str: + return '' + +Breakdown = TypedDict('Breakdown', { + 'local': int, + 'remote': int, + 'connect': int +}) + +Outcomes = TypedDict('Outcomes', { + 'reward': Breakdown, + 'noreward': Breakdown +}) + +Stats = TypedDict('Stats', { + 'exploit': Outcomes, + 'explore': Outcomes +}) + +TrainedAgent = TypedDict('TrainedAgent', { + 'all_episodes_rewards': List[List[float]], + 'all_episodes_availability': List[List[float]], + 'agent': Agent, + 'trained_on': str, + 'title': str +}) + +def print_stats(stats): + """Print learning statistics""" + def print_breakdown(stats, actiontype: str): + def ratio(kind: str) -> str: + x, y = stats[actiontype]['reward'][kind], stats[actiontype]['noreward'][kind] + sum = x + y + if sum == 0: + return 'NaN' + else: + return f"{(x / sum):.2f}" + + def print_kind(kind: str): + print( + f" {actiontype}-{kind}: {stats[actiontype]['reward'][kind]}/{stats[actiontype]['noreward'][kind]} " + f"({ratio(kind)})") + print_kind('local') + print_kind('remote') + print_kind('connect') + + print(" Breakdown [Reward/NoReward (Success rate)]") + print_breakdown(stats, 'explore') + print_breakdown(stats, 'exploit') + +def epsilon_greedy_search( + cyberbattle_gym_env: cyberbattle_env.CyberBattleEnv, + environment_properties: EnvironmentBounds, + agent: Agent, + title: str, + episode_count: int, + iteration_count: int, + epsilon: float, + epsilon_minimum=0.0, + epsilon_multdecay: Optional[float] = None, + epsilon_exponential_decay: Optional[int] = None, + render=True, + 
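+        # Two optional ϵ-decay schedules are implemented below: epsilon_multdecay
+        # multiplies ϵ once per episode (floored at epsilon_minimum), while
+        # epsilon_exponential_decay recomputes, at every step,
+        # ϵ = ϵ_min + (ϵ_0 − ϵ_min)·exp(−steps_done/ϵ_expdecay).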
render_last_episode_rewards_to: Optional[str] = None, + verbosity: Verbosity = Verbosity.Normal, + plot_episodes_length=True +) -> TrainedAgent: + + print(f"###### {title}\n" + f"Learning with: episode_count={episode_count}," + f"iteration_count={iteration_count}," + f"ϵ={epsilon}," + f'ϵ_min={epsilon_minimum}, ' + + (f"ϵ_multdecay={epsilon_multdecay}," if epsilon_multdecay else '') + + (f"ϵ_expdecay={epsilon_exponential_decay}," if epsilon_exponential_decay else '') + + f"{agent.parameters_as_string()}") + + initial_epsilon = epsilon + + all_episodes_rewards = [] + all_episodes_availability = [] + + wrapped_env = AgentWrapper(cyberbattle_gym_env, + ActionTrackingStateAugmentation(environment_properties, cyberbattle_gym_env.reset())) + + steps_done = 0 + plot_title = f"{title} (epochs={episode_count}, ϵ={initial_epsilon}, ϵ_min={epsilon_minimum}," \ + + (f"ϵ_multdecay={epsilon_multdecay}," if epsilon_multdecay else '') \ + + (f"ϵ_expdecay={epsilon_exponential_decay}," if epsilon_exponential_decay else '') \ + + agent.parameters_as_string() + plottraining = PlotTraining(title=plot_title, render_each_episode=render) + + render_file_index = 1 + + for i_episode in range(1, episode_count + 1): + + print(f" ## Episode: {i_episode}/{episode_count} '{title}' " + f"ϵ={epsilon:.4f}, " + f"{agent.parameters_as_string()}") + + observation = wrapped_env.reset() + total_reward = 0.0 + all_rewards = [] + all_availability = [] + agent.new_episode() + + stats = Stats(exploit=Outcomes(reward=Breakdown(local=0, remote=0, connect=0), + noreward=Breakdown(local=0, remote=0, connect=0)), + explore=Outcomes(reward=Breakdown(local=0, remote=0, connect=0), + noreward=Breakdown(local=0, remote=0, connect=0)) + ) + + episode_ended_at = None + sys.stdout.flush() + + bar = progressbar.ProgressBar( + widgets=[ + 'Episode ', + f'{i_episode}', + '|Iteration ', + progressbar.Counter(), + '|', + progressbar.Variable(name='reward', width=6, precision=10), + '|', + progressbar.Variable(name='last_reward_at', width=4), + '|', + progressbar.Timer(), + progressbar.Bar() + ], + redirect_stdout=False) + + for t in bar(range(1, 1 + iteration_count)): + + if epsilon_exponential_decay: + epsilon = epsilon_minimum + math.exp(-1. 
* steps_done / + epsilon_exponential_decay) * (initial_epsilon - epsilon_minimum) + + steps_done += 1 + + x = np.random.rand() + if x <= epsilon: + gym_action, action_metadata = agent.explore(wrapped_env) + action_style = "explore" + else: + gym_action, action_metadata = agent.exploit(wrapped_env, observation) + action_style = "exploit" + + # Take the step + logging.debug(f"gym_action={gym_action}, action_metadata={action_metadata}") + observation, reward, done, info = wrapped_env.step(gym_action) + + action_type = 'exploit' if action_style == 'exploit' else 'explore' + outcome = 'reward' if reward > 0 else 'noreward' + if 'local_vulnerability' in gym_action: + stats[action_type][outcome]['local'] += 1 + elif 'remote_vulnerability' in gym_action: + stats[action_type][outcome]['remote'] += 1 + else: + stats[action_type][outcome]['connect'] += 1 + + agent.on_step(wrapped_env, observation, reward, done, action_metadata, epsilon) + assert np.shape(reward) == () + + all_rewards.append(reward) + all_availability.append(info['network_availability']) + total_reward += reward + bar.update(t, reward=total_reward) + if reward > 0: + bar.update(t, last_reward_at=t) + + if verbosity == Verbosity.Verbose or (verbosity == Verbosity.Normal and reward > 0): + sign = ['-', '+'][reward > 0] + + print(f" {sign} t={t} {action_style} r={reward} cum_reward:{total_reward} " + f"a={action_metadata}-{gym_action} " + f"creds={len(observation['credential_cache_matrix'])} " + f" {agent.stateaction_as_string(action_metadata)}") + + if i_episode == episode_count \ + and render_last_episode_rewards_to is not None \ + and reward > 0: + fig = cyberbattle_gym_env.render_as_fig() + fig.write_image(f"{render_last_episode_rewards_to}-e{i_episode}-{render_file_index}.png") + render_file_index += 1 + + agent.end_of_iteration(t, done) + + if done: + episode_ended_at = t + bar.finish(dirty=True) + break + + sys.stdout.flush() + + loss_string = agent.loss_as_string() + if loss_string: + loss_string = "loss={loss_string}" + + if episode_ended_at: + print(f" Episode {i_episode} ended at t={episode_ended_at} {loss_string}") + else: + print(f" Episode {i_episode} stopped at t={iteration_count} {loss_string}") + + print_stats(stats) + + all_episodes_rewards.append(all_rewards) + all_episodes_availability.append(all_availability) + + length = episode_ended_at if episode_ended_at else iteration_count + agent.end_of_episode(i_episode=i_episode, t=length) + if plot_episodes_length: + plottraining.episode_done(length) + if render: + wrapped_env.render() + + if epsilon_multdecay: + epsilon = max(epsilon_minimum, epsilon * epsilon_multdecay) + + wrapped_env.close() + print("simulation ended") + if plot_episodes_length: + plottraining.plot_end() + + return TrainedAgent( + all_episodes_rewards=all_episodes_rewards, + all_episodes_availability=all_episodes_availability, + agent=agent, + trained_on=cyberbattle_gym_env.name, + title=plot_title + ) \ No newline at end of file diff --git a/Eligibility Trace agents/Watkins Q/plotting.py b/Eligibility Trace agents/Watkins Q/plotting.py new file mode 100644 index 00000000..e51b4a55 --- /dev/null +++ b/Eligibility Trace agents/Watkins Q/plotting.py @@ -0,0 +1,203 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
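+ # These helpers consume the TrainedAgent dictionaries returned by
+ # learner.epsilon_greedy_search; each run is expected to carry the keys
+ # 'all_episodes_rewards', 'all_episodes_availability' and 'title'.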
+ +"""Plotting helpers for agent banchmarking""" + +import matplotlib.pyplot as plt # type:ignore +import numpy as np + + +def new_plot(title): + """Prepare a new plot of cumulative rewards""" + plt.figure(figsize=(10, 8)) + plt.ylabel('cumulative reward', fontsize=20) + plt.xlabel('step', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title(title, fontsize=12) + + +def pad(array, length): + """Pad an array with 0s to make it of desired length""" + padding = np.zeros((length,)) + padding[:len(array)] = array + return padding + + +def plot_episodes_rewards_averaged(results): + """Plot cumulative rewards for a given set of specified episodes""" + max_iteration_count = np.max([len(r) for r in results['all_episodes_rewards']]) + + all_episodes_rewards_padded = [pad(rewards, max_iteration_count) for rewards in results['all_episodes_rewards']] + cumrewards = np.cumsum(all_episodes_rewards_padded, axis=1) + avg = np.average(cumrewards, axis=0) + std = np.std(cumrewards, axis=0) + x = [i for i in range(len(std))] + plt.plot(x, avg, label=results['title']) + plt.fill_between(x, avg - std, avg + std, alpha=0.5) + + +def fill_with_latest_value(array, length): + pad = length - len(array) + if pad > 0: + return np.pad(array, (0, pad), mode='edge') + else: + return array + + +def plot_episodes_availability_averaged(results): + """Plot availability for a given set of specified episodes""" + data = results['all_episodes_availability'] + longest_episode_length = np.max([len(r) for r in data]) + + all_episodes_padded = [fill_with_latest_value(av, longest_episode_length) for av in data] + avg = np.average(all_episodes_padded, axis=0) + std = np.std(all_episodes_padded, axis=0) + x = [i for i in range(len(std))] + plt.plot(x, avg, label=results['title']) + plt.fill_between(x, avg - std, avg + std, alpha=0.5) + + +def plot_episodes_length(learning_results): + """Plot length of every episode""" + plt.figure(figsize=(10, 8)) + plt.ylabel('#iterations', fontsize=20) + plt.xlabel('episode', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title("Length of each episode", fontsize=12) + + for results in learning_results: + iterations = [len(e) for e in results['all_episodes_rewards']] + episode = [i for i in range(len(results['all_episodes_rewards']))] + plt.plot(episode, iterations, label=f"{results['title']}") + + plt.legend(loc="upper right") + plt.show() + + +def plot_each_episode(results): + """Plot cumulative rewards for each episode""" + for i, episode in enumerate(results['all_episodes_rewards']): + cumrewards = np.cumsum(episode) + x = [i for i in range(len(cumrewards))] + plt.plot(x, cumrewards, label=f'Episode {i}') + + +def plot_all_episodes(r): + """Plot cumulative rewards for every episode""" + new_plot(r['title']) + plot_each_episode(r) + plt.legend(loc="lower right") + plt.show() + + +def plot_averaged_cummulative_rewards(title, all_runs): + """Plot averaged cumulative rewards""" + new_plot(title) + for r in all_runs: + plot_episodes_rewards_averaged(r) + plt.legend(loc="lower right") + plt.show() + + +def plot_averaged_availability(title, all_runs): + """Plot averaged network availability""" + plt.figure(figsize=(10, 8)) + plt.ylabel('network availability', fontsize=20) + plt.xlabel('step', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title(title, fontsize=12) + for r in all_runs: + plot_episodes_availability_averaged(r) + plt.legend(loc="lower right") + plt.show() + + +def new_plot_loss(): + """Plot MSE loss averaged over all episodes""" + 
plt.figure(figsize=(10, 8)) + plt.ylabel('loss', fontsize=20) + plt.xlabel('episodes', fontsize=20) + plt.xticks(size=12) + plt.yticks(size=20) + plt.title("Loss", fontsize=12) + + +def plot_all_episodes_loss(all_episodes_losses, name, label): + """Plot loss for one learning episode""" + x = [i for i in range(len(all_episodes_losses))] + plt.plot(x, all_episodes_losses, label=f'{name} {label}') + + +def running_mean(x, size): + """return moving average of x for a window of lenght 'size'""" + cumsum = np.cumsum(np.insert(x, 0, 0)) + return (cumsum[size:] - cumsum[:-size]) / float(size) + + +class PlotTraining: + """Plot training-related stats""" + + def __init__(self, title, render_each_episode): + self.episode_durations = [] + self.title = title + self.render_each_episode = render_each_episode + + def plot_durations(self, average_window=5): + # plt.figure(2) + plt.figure() + # plt.clf() + durations_t = np.array(self.episode_durations, dtype=np.float32) + plt.title('Training...') + plt.xlabel('Episode') + plt.ylabel('Duration') + plt.title(self.title, fontsize=12) + + episodes = [i + 1 for i in range(len(self.episode_durations))] + plt.plot(episodes, durations_t) + # plot episode running averages + if len(durations_t) >= average_window: + means = running_mean(durations_t, average_window) + means = np.concatenate((np.zeros(average_window - 1), means)) + plt.plot(episodes, means) + + # display.display(plt.gcf()) + plt.show() + + def episode_done(self, length): + self.episode_durations.append(length) + if self.render_each_episode: + self.plot_durations() + + def plot_end(self): + self.plot_durations() + plt.ioff() # type: ignore + # plt.show() + + +def length_of_all_episodes(run): + """Get the length of every episode""" + return [len(e) for e in run['all_episodes_rewards']] + + +def reduce(x, desired_width): + return [np.average(c) for c in np.array_split(x, desired_width)] + + +def episodes_rewards_averaged(run): + """Plot cumulative rewards for a given set of specified episodes""" + max_iteration_count = np.max([len(r) for r in run['all_episodes_rewards']]) + all_episodes_rewards_padded = [pad(rewards, max_iteration_count) for rewards in run['all_episodes_rewards']] + cumrewards = np.cumsum(all_episodes_rewards_padded, axis=1) + avg = np.average(cumrewards, axis=0) + return list(avg) + + +def episodes_lengths_for_all_runs(all_runs): + return [length_of_all_episodes(run) for run in all_runs] + + +def averaged_cummulative_rewards(all_runs, width): + return [reduce(episodes_rewards_averaged(run), width) for run in all_runs]
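+ 
+ # Minimal end-to-end sketch of how these helpers combine with
+ # learner.epsilon_greedy_search. Illustrative only: 'CyberBattleToyCtf-v0' and
+ # WatkinsQPolicy are assumed names (any registered CyberBattle gym id and any
+ # concrete Agent implementation will do), and the EnvironmentBounds arguments
+ # are left elided.
+ #
+ #   import gym
+ #   import learner, plotting
+ #   from agent_wrapper import EnvironmentBounds
+ #
+ #   env = gym.make('CyberBattleToyCtf-v0')
+ #   ep = EnvironmentBounds.of_identifiers(...)   # assuming CyberBattleSim's usual helper
+ #   agent = WatkinsQPolicy(...)                  # hypothetical Agent subclass
+ #
+ #   run = learner.epsilon_greedy_search(
+ #       env, ep, agent, title='Watkins Q(λ)',
+ #       episode_count=20, iteration_count=2000,
+ #       epsilon=0.9, epsilon_minimum=0.1, epsilon_exponential_decay=5000)
+ #
+ #   plotting.plot_averaged_cummulative_rewards('cumulative reward', [run])
+ #   plotting.plot_averaged_availability('network availability', [run])
+ #   plotting.plot_episodes_length([run])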