diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 00000000..8f5ca8cf Binary files /dev/null and b/.DS_Store differ diff --git a/Eligibility Trace agents/.DS_Store b/Eligibility Trace agents/.DS_Store new file mode 100644 index 00000000..e63abd15 Binary files /dev/null and b/Eligibility Trace agents/.DS_Store differ diff --git a/Eligibility Trace agents/Actor-Critic/.DS_Store b/Eligibility Trace agents/Actor-Critic/.DS_Store new file mode 100644 index 00000000..8efb5831 Binary files /dev/null and b/Eligibility Trace agents/Actor-Critic/.DS_Store differ diff --git a/Eligibility Trace agents/Actor-Critic/agent_actor_critic.py b/Eligibility Trace agents/Actor-Critic/agent_actor_critic.py new file mode 100644 index 00000000..d9d9def4 --- /dev/null +++ b/Eligibility Trace agents/Actor-Critic/agent_actor_critic.py @@ -0,0 +1,271 @@ +import math +import random +from typing import NamedTuple, Optional, Tuple +import numpy as np +from numpy import ndarray +import logging + +from cyberbattle._env import cyberbattle_env +from agent_wrapper import EnvironmentBounds + +import agent_wrapper as w +from learner import Learner + +import torch + +class StateActionModel: + """How the state is modelled in the enviroment""" + + def __init__(self, ep: EnvironmentBounds): + self.ep = ep + + self.global_features = w.ConcatFeatures(ep, [ + w.Feature_discovered_not_owned_nodes_sliding(ep), + w.Feature_discovered_credential_count(ep) + ]) + + self.source_node_features = w.ConcatFeatures(ep, [ + w.Feature_active_node_properties(ep), + w.Feature_success_actions_at_node(ep) + ]) + + self.target_node_features = w.ConcatFeatures(ep, [ + w.Feature_active_node_id(ep) + ]) + + self.state_space = w.ConcatFeatures(ep, self.global_features.feature_selection + + self.source_node_features.feature_selection + + self.target_node_features.feature_selection) + + self.action_space = w.AbstractAction(ep) + + def valid_actions(self, wrapped_env: w.AgentWrapper, observation): + """returns a list of valid actions and the nodes they can be carried out from""" + + nodes_and_actions = [] + discovered_nodes = np.union1d(w.owned_nodes(observation), w.discovered_nodes_notowned(observation)) + + for from_node in w.owned_nodes(observation): + for local_action in range(self.action_space.n_local_actions): + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, local_action, None) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, local_action, -1)) + + for remote_action in range(self.action_space.n_local_actions, self.action_space.n_local_actions + self.action_space.n_remote_actions): + for target_node in discovered_nodes: + if target_node != from_node: + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, remote_action, target_node) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, remote_action, target_node)) + + for connect_action in range(self.action_space.n_local_actions + self.action_space.n_remote_actions, self.action_space.n_actions): + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, connect_action, None) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, connect_action, -1)) + + return nodes_and_actions + +class Memory: + """The memory structure that stores the critic value function and the 
actors state action policy""" + + def __init__(self, ep:EnvironmentBounds, hash_size): + self.hash_size = hash_size + + self.actor = torch.zeros([2, hash_size], dtype=torch.float64) + + self.critic = torch.zeros([2, hash_size], dtype=torch.float64) + + def state_action_index(self, state_space, abstract_action): + """Turns a state action pair into an index for the actor tensor""" + feature_vector = np.append(state_space, abstract_action) + hash_number = abs(hash(str(feature_vector))) + return hash_number % self.hash_size + + def state_index(self, state_space): + """Turns the state into an index for the critic tensor""" + hash_number = abs(hash(str(state_space))) + return hash_number % self.hash_size + + +class ChosenActionMetadata(NamedTuple): + """Metadata attached to every gym action""" + + abstract_action: np.int32 + actor_node: int + actor_features: ndarray + actor_state: ndarray + + def __repr__(self) -> str: + return f"[abstract_action={self.abstract_action}, actor={self.actor_node}, state={self.actor_state}]" + +class ActorCriticPolicy(Learner): + + def __init__(self, + ep: EnvironmentBounds, + gamma: float, + λ: float, + learning_rate: float, + hash_size: int + ): + + self.n_local_actions = ep.local_attacks_count + self.n_remote_actions = ep.remote_attacks_count + self.model = StateActionModel(ep) + self.gamma = gamma + self.λ = λ + self.learning_rate = learning_rate + self.hash_size = hash_size + + self.memory = Memory(ep, hash_size=hash_size) + + def parameters_as_string(self): + return f'γ={self.gamma}, lr={self.learning_rate}, λ={self.λ},\n' \ + f'hash_size={self.hash_size}' + + def all_parameters_as_string(self) -> str: + model = self.model + return f'{self.parameters_as_string()}\n' \ + f'dimension={model.state_space.flat_size()}x{model.action_space.flat_size()}, ' \ + f'Q={[f.name() for f in model.state_space.feature_selection]} ' \ + f"-> 'abstract_action'" + + def get_actor_state_vector(self, global_state: ndarray, actor_features: ndarray, target_features: Optional[ndarray]) -> ndarray: + """Turns seperate state features into one vector""" + if target_features is None: + return np.concatenate((np.array(global_state, dtype=np.float32), + np.array(actor_features, dtype=np.float32))) + else: + return np.concatenate((np.array(global_state, dtype=np.float32), + np.array(actor_features, dtype=np.float32), + np.array(target_features, dtype=np.float32))) + + def update_memory(self, + reward: float, + actor_state: ndarray, + abstract_action: int, + next_actor_state: Optional[ndarray]): + """The actor's and critic's memories are updated with reward from the action just used""" + + #The temporal difference error, δ, is calculated then used to update the actor and critic + current_state_index = self.memory.state_index(actor_state) + if next_actor_state is None: + δ = reward - self.memory.critic[0][current_state_index].item() + else: + next_state_index = self.memory.state_index(next_actor_state) + δ = reward + (self.gamma * self.memory.critic[0][next_state_index].item()) - self.memory.critic[0][current_state_index].item() + + #Update the Actor + current_state_action_index = self.memory.state_action_index(actor_state, abstract_action) + + self.memory.actor[1][current_state_action_index] += 1 + + self.memory.actor[0][current_state_action_index] += self.learning_rate * δ * self.memory.actor[1][current_state_action_index].item() + self.memory.actor[0][current_state_action_index] = round(self.memory.actor[0][current_state_action_index].item(), 5) + 
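# clamp the updated actor preference into [0, 100]; assumed here to be a numerical guard so exp() in the Gibbs softmax below cannot overflow
+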
self.memory.actor[0][current_state_action_index] = max(0, self.memory.actor[0][current_state_action_index].item()) + self.memory.actor[0][current_state_action_index] = min(100, self.memory.actor[0][current_state_action_index].item()) + + non_zero_indicies = torch.argwhere(self.memory.actor[1]).numpy() + for i in non_zero_indicies: + self.memory.actor[1][i] = self.memory.actor[1][i].item() * self.gamma * self.λ + + #Update the Critic + self.memory.critic[1][current_state_index] += 1 + + non_zero_indicies_v = torch.argwhere(self.memory.critic[0]).numpy() + non_zero_indicies_e = torch.argwhere(self.memory.critic[1]).numpy() + non_zero_indicies = np.union1d(non_zero_indicies_v, non_zero_indicies_e) + + for i in non_zero_indicies: + + self.memory.critic[0][i] = self.memory.critic[0][i].item() + (self.learning_rate * δ * self.memory.critic[1][i].item()) + self.memory.critic[1][i] = self.memory.critic[1][i].item() * self.gamma * self.λ + self.memory.critic[0][i] = max(0, self.memory.critic[0][i].item()) + + def on_step(self, wrapped_env: w.AgentWrapper, reward: float, done: bool, action_metadata): + + if done: + self.update_memory(reward, + actor_state=action_metadata.actor_state, + abstract_action=action_metadata.abstract_action, + next_actor_state=None + ) + else: + self.update_memory(reward, + actor_state=action_metadata.actor_state, + abstract_action=action_metadata.abstract_action, + next_actor_state=wrapped_env.state + ) + + + def new_episode(self): + torch.mul(self.memory.actor[1], 0) + torch.mul(self.memory.critic[1], 0) + + def end_of_episode(self, i_episode, t): + return None + + def end_of_iteration(self, t, done): + return None + + def metadata_from_gymaction(self, wrapped_env: w.AgentWrapper, gym_action): + """Takes in a gym action and returns it's metadata""" + current_global_state = self.model.global_features.get(wrapped_env.state, node=None) + actor_node = cyberbattle_env.sourcenode_of_action(gym_action) + actor_features = self.model.source_node_features.get(wrapped_env.state, actor_node) + abstract_action = self.model.action_space.abstract_from_gymaction(gym_action) + + if 'remote_vulnerability' in gym_action: + target_node = self.model.target_node_features.get(wrapped_env.state, gym_action['remote_vulnerability'][1]) + else: + target_node = None + + return ChosenActionMetadata( + abstract_action=abstract_action, + actor_node=actor_node, + actor_features=actor_features, + actor_state=self.get_actor_state_vector(current_global_state, actor_features, target_node)) + + def get_action(self, wrapped_env: w.AgentWrapper, observation, exploit) -> Tuple[str, Optional[cyberbattle_env.Action], object, float]: + """Uses Gibbs Softmax distribution to select the next action to be used""" + current_global_state = self.model.global_features.get(wrapped_env.state, node=None) + valid_nodes_and_actions = self.model.valid_actions(wrapped_env, observation) + + #The p_values are the estimated returns from the actor function of taking the action in the current state + p_values = [] + for item in valid_nodes_and_actions: + source_node_features = self.model.source_node_features.get(wrapped_env.state, item[0]) + + if item[1] < self.n_local_actions or item[1] - self.n_local_actions > self.n_remote_actions: + actor_state_vector = self.get_actor_state_vector(current_global_state, source_node_features, None) + else: + target_node_features = self.model.target_node_features.get(wrapped_env.state, item[2]) + actor_state_vector = self.get_actor_state_vector(current_global_state, source_node_features, 
target_node_features) + + action_state_index = self.memory.state_action_index(actor_state_vector, item[1]) + + p_values.append(self.memory.actor[0][action_state_index].item()) + + if exploit: + indicies_of_chosen_actions = [i for i, x in enumerate(p_values) if x == max(p_values)] + chosen_action_index = random.choice(indicies_of_chosen_actions) + chosen_action = valid_nodes_and_actions[chosen_action_index] + + else: + softmax_denominator = 0 + for p_value in p_values: + softmax_denominator += math.exp(p_value) + + probabilities = [] + for p_value in p_values: + probabilities.append(math.exp(p_value) / softmax_denominator) + + chosen_action = random.choices(valid_nodes_and_actions, weights=probabilities, k=1)[0] + + if chosen_action[1] < self.n_local_actions or chosen_action[1] - self.n_local_actions > self.n_remote_actions: + gym_action = self.model.action_space.abstract_to_gymaction(chosen_action[0], observation, chosen_action[1], None) + else: + gym_action = self.model.action_space.abstract_to_gymaction(chosen_action[0], observation, chosen_action[1], chosen_action[2]) + + metadata = self.metadata_from_gymaction(wrapped_env, gym_action) + + return gym_action, metadata \ No newline at end of file diff --git a/Eligibility Trace agents/Actor-Critic/agent_wrapper.py b/Eligibility Trace agents/Actor-Critic/agent_wrapper.py new file mode 100644 index 00000000..985aa1e9 --- /dev/null +++ b/Eligibility Trace agents/Actor-Critic/agent_wrapper.py @@ -0,0 +1,342 @@ +from cyberbattle._env.cyberbattle_env import EnvironmentBounds +from typing import Optional, List +import enum +import numpy as np +from gym import spaces, Wrapper +from numpy import ndarray +import cyberbattle._env.cyberbattle_env as cyberbattle_env +import logging + + +class StateAugmentation: + """Default agent state augmentation, consisting of the gym environment + observation itself and nothing more.""" + + def __init__(self, observation: cyberbattle_env.Observation): + self.observation = observation + + def on_step(self, action: cyberbattle_env.Action, reward: float, done: bool, observation: cyberbattle_env.Observation): + self.observation = observation + + def on_reset(self, observation: cyberbattle_env.Observation): + self.observation = observation + + +class Feature(spaces.MultiDiscrete): + """ + Feature consisting of multiple discrete dimensions. + Parameters: + nvec: is a vector defining the number of possible values + for each discrete space. 
+ """ + + def __init__(self, env_properties: EnvironmentBounds, nvec): + self.env_properties = env_properties + super().__init__(nvec) + + def flat_size(self): + return np.prod(self.nvec) + + def name(self): + """Return the name of the feature""" + p = len(type(Feature(self.env_properties, [])).__name__) + 1 + return type(self).__name__[p:] + + def get(self, a: StateAugmentation, node: Optional[int]) -> np.ndarray: + """Compute the current value of a feature value at + the current observation and specific node""" + raise NotImplementedError + + def pretty_print(self, v): + return v + +class Feature_active_node_properties(Feature): + """Bitmask of all properties set for the active node""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [2] * p.property_count) + + def get(self, a: StateAugmentation, node) -> ndarray: + assert node is not None, 'feature only valid in the context of a node' + + node_prop = a.observation['discovered_nodes_properties'] + + # list of all properties set/unset on the node + # Remap to get rid of unknown value 0: 1 -> 1, and -1 -> 0 (and 0-> 0) + assert node < len(node_prop), f'invalid node index {node} (not discovered yet)' + remapped = np.array((1 + node_prop[node]) / 2, dtype=int) + return remapped + + +class Feature_active_node_id(Feature): + """Return the node id itself""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count] * 1) + + def get(self, a: StateAugmentation, node) -> ndarray: + return np.array([node], dtype=int) + + +class Feature_discovered_credential_count(Feature): + """number of credentials discovered so far""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_total_credentials + 1]) + + def get(self, a: StateAugmentation, node): + return [len(a.observation['credential_cache_matrix'])] + + +class Feature_discovered_not_owned_nodes_sliding(Feature): + """array of which of discovered nodes not owned by name""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count + 1]) + + def get(self, a: StateAugmentation, node): + discovered = a.observation['discovered_nodes'] + levels = a.observation['nodes_privilegelevel'] + owned_nodes_indices = np.where(levels > 0)[0] + owned = [] + for i in owned_nodes_indices: + owned.append(discovered[i]) + discovered_not_owned = [] + for node in discovered: + if node not in owned: + discovered_not_owned.append(node) + discovered_not_owned_sliding = np.zeros(self.env_properties.maximum_node_count, np.int32) + for node_id in discovered_not_owned: + if node_id == 'client': + discovered_not_owned_sliding[0] = 1 + elif node_id == 'Website': + discovered_not_owned_sliding[1] = 1 + elif node_id == 'Website.Directory': + discovered_not_owned_sliding[2] = 1 + elif node_id == 'Website[user=monitor]': + discovered_not_owned_sliding[3] = 1 + elif node_id == 'GitHubProject': + discovered_not_owned_sliding[4] = 1 + elif node_id == 'AzureStorage': + discovered_not_owned_sliding[5] = 1 + elif node_id == 'Sharepoint': + discovered_not_owned_sliding[6] = 1 + elif node_id == 'AzureResourceManager': + discovered_not_owned_sliding[7] = 1 + elif node_id == 'AzureResourceManager[user-monitor]': + discovered_not_owned_sliding[8] = 1 + elif node_id == 'AzureVM': + discovered_not_owned_sliding[9] = 1 + return discovered_not_owned_sliding + +class Feature_active_node_id(Feature): + """number asigned to each type of node in toy-ctf""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count + 
1]) + + def get(self, a: StateAugmentation, node): + node_id = a.observation['discovered_nodes'][node] + + node_id_array = np.zeros(1, np.int32) + if node_id == 'client': + node_id_array[0] = 0 + elif node_id == 'Website': + node_id_array[0] = 1 + elif node_id == 'Website.Directory': + node_id_array[0] = 2 + elif node_id == 'Website[user=monitor]': + node_id_array[0] = 3 + elif node_id == 'GitHubProject': + node_id_array[0] = 4 + elif node_id == 'AzureStorage': + node_id_array[0] = 5 + elif node_id == 'Sharepoint': + node_id_array[0] = 6 + elif node_id == 'AzureResourceManager': + node_id_array[0] = 7 + elif node_id == 'AzureResourceManager[user-monitor]': + node_id_array[0] = 8 + elif node_id == 'AzureVM': + node_id_array[0] = 9 + else: + node_id_array[0] = 10 + return node_id_array + + +class ConcatFeatures(Feature): + """ Concatenate a list of features into a single feature + Parameters: + feature_selection - a selection of features to combine + """ + + def __init__(self, p: EnvironmentBounds, feature_selection: List[Feature]): + self.feature_selection = feature_selection + self.dim_sizes = np.concatenate([f.nvec for f in feature_selection]) + super().__init__(p, [self.dim_sizes]) + + def pretty_print(self, v): + return v + + def get(self, a: StateAugmentation, node=None) -> np.ndarray: + """Return the feature vector""" + feature_vector = [f.get(a, node) for f in self.feature_selection] + return np.concatenate(feature_vector) + + +def owned_nodes(observation): + """Return the list of owned nodes""" + return np.nonzero(observation['nodes_privilegelevel'])[0] + + +def discovered_nodes_notowned(observation): + """Return the list of discovered nodes that are not owned yet""" + return np.nonzero(observation['nodes_privilegelevel'] == 0)[0] + + +class AbstractAction(Feature): + """An abstraction of the gym state space that reduces + the space dimension for learning use to just + - local_attack(vulnid) (source_node provided) + - remote_attack(vulnid) (source_node provided, target_node forgotten) + - connect(port) (source_node provided, target_node forgotten, credentials infered from cache) + """ + + def __init__(self, p: EnvironmentBounds): + self.n_local_actions = p.local_attacks_count + self.n_remote_actions = p.remote_attacks_count + self.n_connect_actions = p.port_count + self.n_actions = self.n_local_actions + self.n_remote_actions + self.n_connect_actions + super().__init__(p, [self.n_actions]) + + def abstract_to_gymaction(self, source_node, observation, abstract_action, target_node): + """Takes a statring node and an abstract action number and returns a gym action""" + + if abstract_action < self.n_local_actions: + vuln = abstract_action + return {'local_vulnerability': np.array([source_node, vuln])} + + node_prop = observation['discovered_nodes_properties'] + abstract_action -= self.n_local_actions + if abstract_action < self.n_remote_actions: + vuln = abstract_action + + discovered_nodes_count = len(node_prop) + if discovered_nodes_count <= 1: + return None + + return {'remote_vulnerability': np.array([source_node, target_node, vuln])} + + abstract_action -= self.n_remote_actions + port = np.int32(abstract_action) + + discovered_credentials = np.array(observation['credential_cache_matrix']) + n_discovered_creds = len(discovered_credentials) + if n_discovered_creds <= 0: + return None + + nodes_not_owned = discovered_nodes_notowned(observation) + match_port = discovered_credentials[:, 1] == port + match_port_indicies = np.where(match_port)[0] + + credential_indices_choices = [c for c 
in match_port_indicies + if discovered_credentials[c, 0] in nodes_not_owned] + + if credential_indices_choices: + logging.debug('found matching cred in the credential cache') + else: + logging.debug('no cred matching requested port, trying instead creds used to access other ports') + credential_indices_choices = [i for (i, n) in enumerate(discovered_credentials[:, 0]) + if n in nodes_not_owned] + + if credential_indices_choices: + logging.debug('found cred in the credential cache without matching port name') + else: + logging.debug('no cred to use from the credential cache') + return None + + cred = np.int32(np.random.choice(credential_indices_choices)) + target = np.int32(discovered_credentials[cred, 0]) + return {'connect': np.array([source_node, target, port, cred], dtype=np.int32)} + + def abstract_from_gymaction(self, gym_action: cyberbattle_env.Action) -> np.int32: + """Turns a gym action into it's abstract action number""" + if 'local_vulnerability' in gym_action: + return gym_action['local_vulnerability'][1] + elif 'remote_vulnerability' in gym_action: + r = gym_action['remote_vulnerability'] + return self.n_local_actions + r[2] + + assert 'connect' in gym_action + c = gym_action['connect'] + + a = self.n_local_actions + self.n_remote_actions + c[2] + assert a < self.n_actions + return np.int32(a) + + +class ActionTrackingStateAugmentation(StateAugmentation): + """An agent state augmentation consisting of + the environment observation augmented with the following dynamic information: + - success_action_count: count of action taken and succeeded at the current node + - failed_action_count: count of action taken and failed at the current node + """ + + def __init__(self, p: EnvironmentBounds, observation: cyberbattle_env.Observation): + self.aa = AbstractAction(p) + self.success_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.failed_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.env_properties = p + super().__init__(observation) + + def on_step(self, action: cyberbattle_env.Action, reward: float, done: bool, observation: cyberbattle_env.Observation): + node = cyberbattle_env.sourcenode_of_action(action) + abstract_action = self.aa.abstract_from_gymaction(action) + if reward > 0: + self.success_action_count[node, abstract_action] += 1 + else: + self.failed_action_count[node, abstract_action] += 1 + super().on_step(action, reward, done, observation) + + def on_reset(self, observation: cyberbattle_env.Observation): + p = self.env_properties + self.success_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.failed_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + super().on_reset(observation) + + +class Feature_success_actions_at_node(Feature): + """number of time each action succeeded at a given node""" + + max_action_count = 100 + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [self.max_action_count] * AbstractAction(p).n_actions) + + def get(self, a: ActionTrackingStateAugmentation, node: int): + return np.minimum(a.success_action_count[node, :], self.max_action_count - 1) + + +class Verbosity(enum.Enum): + """Verbosity of the learning function""" + Quiet = 0 + Normal = 1 + Verbose = 2 + + +class AgentWrapper(Wrapper): + """Gym wrapper to update the agent state on every step""" + + def __init__(self, env: cyberbattle_env.CyberBattleEnv, state: StateAugmentation): + super().__init__(env) 
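+        # keep a reference to the state augmentation; step() and reset() below pass every new observation back into it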
+ self.state = state + + def step(self, action: cyberbattle_env.Action): + observation, reward, done, info = self.env.step(action) + self.state.on_step(action, reward, done, observation) + return observation, reward, done, info + + def reset(self): + observation = self.env.reset() + self.state.on_reset(observation) + return observation \ No newline at end of file diff --git a/Eligibility Trace agents/Actor-Critic/learner.py b/Eligibility Trace agents/Actor-Critic/learner.py new file mode 100644 index 00000000..60ad359c --- /dev/null +++ b/Eligibility Trace agents/Actor-Critic/learner.py @@ -0,0 +1,232 @@ +from cmath import pi +import math +import sys + +from plotting import PlotTraining, plot_averaged_cummulative_rewards +from agent_wrapper import AgentWrapper, EnvironmentBounds, Verbosity, ActionTrackingStateAugmentation +import logging +import numpy as np +from cyberbattle._env import cyberbattle_env +from typing import Tuple, Optional, TypedDict, List +import progressbar +import abc + +class Agent(abc.ABC): + + @abc.abstractmethod + def get_action(self, wrapped_env: AgentWrapper, observation, exploit) -> Tuple[str, Optional[cyberbattle_env.Action], object, float]: + """Exploit function. + Returns (action_type, gym_action, action_metadata) where + action_metadata is a custom object that gets passed to the on_step callback function""" + raise NotImplementedError + + @abc.abstractmethod + def on_step(self, wrapped_env: AgentWrapper, reward, done, action_metadata,π) -> None: + raise NotImplementedError + + def parameters_as_string(self) -> str: + return '' + + def all_parameters_as_string(self) -> str: + return '' + + def loss_as_string(self) -> str: + return '' + + def stateaction_as_string(self, action_metadata) -> str: + return '' + +Breakdown = TypedDict('Breakdown', { + 'local': int, + 'remote': int, + 'connect': int +}) + +Outcomes = TypedDict('Outcomes', { + 'reward': Breakdown, + 'noreward': Breakdown +}) + +Stats = TypedDict('Stats', { + 'exploit': Outcomes, + 'explore': Outcomes, + 'exploit_deflected_to_explore': int +}) + +TrainedAgent = TypedDict('TrainedAgent', { + 'all_episodes_rewards': List[List[float]], + 'all_episodes_availability': List[List[float]], + 'agent': Agent, + 'trained_on': str, + 'title': str +}) + +def print_stats(stats): + """Print learning statistics""" + print(" Breakdown [Reward/NoReward (Success rate)]") + def ratio(kind: str) -> str: + x, y = stats['reward'][kind], stats['noreward'][kind] + sum = x + y + if sum == 0: + return 'NaN' + else: + return f"{(x / sum):.2f}" + + def print_kind(kind: str): + print( + f" {kind}: {stats['reward'][kind]}/{stats['noreward'][kind]} " + f"({ratio(kind)})") + print_kind('local') + print_kind('remote') + print_kind('connect') + +def gibbs_softmax_search( + cyberbattle_gym_env: cyberbattle_env.CyberBattleEnv, + environment_properties: EnvironmentBounds, + agent: Agent, + title: str, + episode_count: int, + iteration_count: int, + exploit: bool, + render=True, + render_last_episode_rewards_to: Optional[str] = None, + verbosity: Verbosity = Verbosity.Normal, + plot_episodes_length=True +) -> TrainedAgent: + + print(f"###### {title}\n" + f"Learning with: episode_count={episode_count}," + f"iteration_count={iteration_count}," + + f"{agent.parameters_as_string()}") + + all_episodes_rewards = [] + all_episodes_availability = [] + + wrapped_env = AgentWrapper(cyberbattle_gym_env, + ActionTrackingStateAugmentation(environment_properties, cyberbattle_gym_env.reset())) + steps_done = 0 + plot_title = f"{title} 
(epochs={episode_count}" \ + + agent.parameters_as_string() + plottraining = PlotTraining(title=plot_title, render_each_episode=render) + + render_file_index = 1 + + for i_episode in range(1, episode_count + 1): + + print(f"\n ## Episode: {i_episode}/{episode_count} '{title}' " + f"{agent.parameters_as_string()}") + + observation = wrapped_env.reset() + total_reward = 0.0 + all_rewards = [] + all_availability = [] + agent.new_episode() + + stats = Stats(Outcomes(reward=Breakdown(local=0, remote=0, connect=0), + noreward=Breakdown(local=0, remote=0, connect=0)) + ) + + episode_ended_at = None + sys.stdout.flush() + + bar = progressbar.ProgressBar( + widgets=[ + 'Episode ', + f'{i_episode}', + '|Iteration ', + progressbar.Counter(), + '|', + progressbar.Variable(name='reward', width=6, precision=10), + '|', + progressbar.Variable(name='last_reward_at', width=4), + '|', + progressbar.Timer(), + progressbar.Bar() + ], + redirect_stdout=False) + + for t in bar(range(1, 1 + iteration_count)): + + steps_done += 1 + + gym_action, action_metadata = agent.get_action(wrapped_env, observation, exploit) + + # Take the step + logging.debug(f"gym_action={gym_action}, action_metadata={action_metadata}") + observation, reward, done, info = wrapped_env.step(gym_action) + + outcome = 'reward' if reward > 0 else 'noreward' + if 'local_vulnerability' in gym_action: + stats[outcome]['local'] += 1 + elif 'remote_vulnerability' in gym_action: + stats[outcome]['remote'] += 1 + else: + stats[outcome]['connect'] += 1 + + agent.on_step(wrapped_env, reward, done, action_metadata) + assert np.shape(reward) == () + + all_rewards.append(reward) + all_availability.append(info['network_availability']) + total_reward += reward + bar.update(t, reward=total_reward) + if reward > 0: + bar.update(t, last_reward_at=t) + + if verbosity == Verbosity.Verbose or (verbosity == Verbosity.Normal and reward > 0): + sign = ['-', '+'][reward > 0] + + print(f" {sign} t={t} r={reward} cum_reward:{total_reward} " + f"a={action_metadata}-{gym_action} " + f"creds={len(observation['credential_cache_matrix'])} " + f" {agent.stateaction_as_string(action_metadata)}") + + if i_episode == episode_count \ + and render_last_episode_rewards_to is not None \ + and reward > 0: + fig = cyberbattle_gym_env.render_as_fig() + fig.write_image(f"{render_last_episode_rewards_to}-e{i_episode}-{render_file_index}.png") + render_file_index += 1 + + agent.end_of_iteration(t, done) + + if done: + episode_ended_at = t + bar.finish(dirty=True) + break + + sys.stdout.flush() + + loss_string = agent.loss_as_string() + if loss_string: + loss_string = "loss={loss_string}" + + if episode_ended_at: + print(f" Episode {i_episode} ended at t={episode_ended_at} {loss_string}") + else: + print(f" Episode {i_episode} stopped at t={iteration_count} {loss_string}") + + print_stats(stats) + + all_episodes_rewards.append(all_rewards) + all_episodes_availability.append(all_availability) + + length = episode_ended_at if episode_ended_at else iteration_count + agent.end_of_episode(i_episode=i_episode, t=length) + if plot_episodes_length: + plottraining.episode_done(length) + if render: + wrapped_env.render() + + wrapped_env.close() + print("simulation ended") + if plot_episodes_length: + plottraining.plot_end() + + return TrainedAgent( + all_episodes_rewards=all_episodes_rewards, + all_episodes_availability=all_episodes_availability, + agent=agent, + trained_on=cyberbattle_gym_env.name, + title=plot_title + ) \ No newline at end of file diff --git a/Eligibility Trace 
agents/Actor-Critic/plotting.py b/Eligibility Trace agents/Actor-Critic/plotting.py new file mode 100644 index 00000000..e51b4a55 --- /dev/null +++ b/Eligibility Trace agents/Actor-Critic/plotting.py @@ -0,0 +1,203 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Plotting helpers for agent banchmarking""" + +import matplotlib.pyplot as plt # type:ignore +import numpy as np + + +def new_plot(title): + """Prepare a new plot of cumulative rewards""" + plt.figure(figsize=(10, 8)) + plt.ylabel('cumulative reward', fontsize=20) + plt.xlabel('step', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title(title, fontsize=12) + + +def pad(array, length): + """Pad an array with 0s to make it of desired length""" + padding = np.zeros((length,)) + padding[:len(array)] = array + return padding + + +def plot_episodes_rewards_averaged(results): + """Plot cumulative rewards for a given set of specified episodes""" + max_iteration_count = np.max([len(r) for r in results['all_episodes_rewards']]) + + all_episodes_rewards_padded = [pad(rewards, max_iteration_count) for rewards in results['all_episodes_rewards']] + cumrewards = np.cumsum(all_episodes_rewards_padded, axis=1) + avg = np.average(cumrewards, axis=0) + std = np.std(cumrewards, axis=0) + x = [i for i in range(len(std))] + plt.plot(x, avg, label=results['title']) + plt.fill_between(x, avg - std, avg + std, alpha=0.5) + + +def fill_with_latest_value(array, length): + pad = length - len(array) + if pad > 0: + return np.pad(array, (0, pad), mode='edge') + else: + return array + + +def plot_episodes_availability_averaged(results): + """Plot availability for a given set of specified episodes""" + data = results['all_episodes_availability'] + longest_episode_length = np.max([len(r) for r in data]) + + all_episodes_padded = [fill_with_latest_value(av, longest_episode_length) for av in data] + avg = np.average(all_episodes_padded, axis=0) + std = np.std(all_episodes_padded, axis=0) + x = [i for i in range(len(std))] + plt.plot(x, avg, label=results['title']) + plt.fill_between(x, avg - std, avg + std, alpha=0.5) + + +def plot_episodes_length(learning_results): + """Plot length of every episode""" + plt.figure(figsize=(10, 8)) + plt.ylabel('#iterations', fontsize=20) + plt.xlabel('episode', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title("Length of each episode", fontsize=12) + + for results in learning_results: + iterations = [len(e) for e in results['all_episodes_rewards']] + episode = [i for i in range(len(results['all_episodes_rewards']))] + plt.plot(episode, iterations, label=f"{results['title']}") + + plt.legend(loc="upper right") + plt.show() + + +def plot_each_episode(results): + """Plot cumulative rewards for each episode""" + for i, episode in enumerate(results['all_episodes_rewards']): + cumrewards = np.cumsum(episode) + x = [i for i in range(len(cumrewards))] + plt.plot(x, cumrewards, label=f'Episode {i}') + + +def plot_all_episodes(r): + """Plot cumulative rewards for every episode""" + new_plot(r['title']) + plot_each_episode(r) + plt.legend(loc="lower right") + plt.show() + + +def plot_averaged_cummulative_rewards(title, all_runs): + """Plot averaged cumulative rewards""" + new_plot(title) + for r in all_runs: + plot_episodes_rewards_averaged(r) + plt.legend(loc="lower right") + plt.show() + + +def plot_averaged_availability(title, all_runs): + """Plot averaged network availability""" + plt.figure(figsize=(10, 8)) + plt.ylabel('network availability', fontsize=20) + 
plt.xlabel('step', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title(title, fontsize=12) + for r in all_runs: + plot_episodes_availability_averaged(r) + plt.legend(loc="lower right") + plt.show() + + +def new_plot_loss(): + """Plot MSE loss averaged over all episodes""" + plt.figure(figsize=(10, 8)) + plt.ylabel('loss', fontsize=20) + plt.xlabel('episodes', fontsize=20) + plt.xticks(size=12) + plt.yticks(size=20) + plt.title("Loss", fontsize=12) + + +def plot_all_episodes_loss(all_episodes_losses, name, label): + """Plot loss for one learning episode""" + x = [i for i in range(len(all_episodes_losses))] + plt.plot(x, all_episodes_losses, label=f'{name} {label}') + + +def running_mean(x, size): + """return moving average of x for a window of lenght 'size'""" + cumsum = np.cumsum(np.insert(x, 0, 0)) + return (cumsum[size:] - cumsum[:-size]) / float(size) + + +class PlotTraining: + """Plot training-related stats""" + + def __init__(self, title, render_each_episode): + self.episode_durations = [] + self.title = title + self.render_each_episode = render_each_episode + + def plot_durations(self, average_window=5): + # plt.figure(2) + plt.figure() + # plt.clf() + durations_t = np.array(self.episode_durations, dtype=np.float32) + plt.title('Training...') + plt.xlabel('Episode') + plt.ylabel('Duration') + plt.title(self.title, fontsize=12) + + episodes = [i + 1 for i in range(len(self.episode_durations))] + plt.plot(episodes, durations_t) + # plot episode running averages + if len(durations_t) >= average_window: + means = running_mean(durations_t, average_window) + means = np.concatenate((np.zeros(average_window - 1), means)) + plt.plot(episodes, means) + + # display.display(plt.gcf()) + plt.show() + + def episode_done(self, length): + self.episode_durations.append(length) + if self.render_each_episode: + self.plot_durations() + + def plot_end(self): + self.plot_durations() + plt.ioff() # type: ignore + # plt.show() + + +def length_of_all_episodes(run): + """Get the length of every episode""" + return [len(e) for e in run['all_episodes_rewards']] + + +def reduce(x, desired_width): + return [np.average(c) for c in np.array_split(x, desired_width)] + + +def episodes_rewards_averaged(run): + """Plot cumulative rewards for a given set of specified episodes""" + max_iteration_count = np.max([len(r) for r in run['all_episodes_rewards']]) + all_episodes_rewards_padded = [pad(rewards, max_iteration_count) for rewards in run['all_episodes_rewards']] + cumrewards = np.cumsum(all_episodes_rewards_padded, axis=1) + avg = np.average(cumrewards, axis=0) + return list(avg) + + +def episodes_lengths_for_all_runs(all_runs): + return [length_of_all_episodes(run) for run in all_runs] + + +def averaged_cummulative_rewards(all_runs, width): + return [reduce(episodes_rewards_averaged(run), width) for run in all_runs] diff --git a/Eligibility Trace agents/Notebooks/.DS_Store b/Eligibility Trace agents/Notebooks/.DS_Store new file mode 100644 index 00000000..bfaabc34 Binary files /dev/null and b/Eligibility Trace agents/Notebooks/.DS_Store differ diff --git a/Eligibility Trace agents/Notebooks/Actor Critic Testing Basic.ipynb b/Eligibility Trace agents/Notebooks/Actor Critic Testing Basic.ipynb new file mode 100644 index 00000000..9c73413c --- /dev/null +++ b/Eligibility Trace agents/Notebooks/Actor Critic Testing Basic.ipynb @@ -0,0 +1,201 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "fae4d125", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + 
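"# note: the sys.path.insert below uses a machine-specific absolute path; adjust it to your local checkout of the Actor-Critic folder\n",
+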
"import logging\n", + "import gym\n", + "from cyberbattle._env.cyberbattle_env import AttackerGoal\n", + "sys.path.insert(0, '/Users/freddiebarrell/Desktop/Repository/Actor-Critic')\n", + "import agent_actor_critic as a\n", + "import agent_wrapper as w\n", + "import learner as learner\n", + "from agent_wrapper import Verbosity\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format=\"%(levelname)s: %(message)s\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3928c9e", + "metadata": {}, + "outputs": [], + "source": [ + "gym_env = gym.make(\"CyberBattleToyCtf-v0\",\n", + " attacker_goal=AttackerGoal(\n", + " reward=430,\n", + " own_atleast_percent=0.6))\n", + "\n", + "ep = w.EnvironmentBounds.of_identifiers(\n", + " maximum_node_count=12,\n", + " maximum_total_credentials=10,\n", + " identifiers=gym_env.identifiers\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fca12ff", + "metadata": {}, + "outputs": [], + "source": [ + "actor_critic_five_episodes = learner.gibbs_softmax_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.ActorCriticPolicy(\n", + " ep,\n", + " gamma=0.01, λ=0.1, learning_rate=0.1, hash_size=98689),\n", + " episode_count=5,\n", + " iteration_count=1000,\n", + " exploit=False,\n", + " render=False,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Actor-Critic five\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c7cc52d", + "metadata": {}, + "outputs": [], + "source": [ + "actor_critic_exploit_five = learner.gibbs_softmax_search(\n", + "gym_env,\n", + "ep,\n", + "learner=actor_critic_five_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "exploit=True,\n", + "render=False,\n", + "plot_episodes_length=True,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Actor_Critic five\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26cefa42", + "metadata": {}, + "outputs": [], + "source": [ + "actor_critic_ten_episodes = learner.gibbs_softmax_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.ActorCriticPolicy(\n", + " ep,\n", + " gamma=0.01, λ=0.1, learning_rate=0.1, hash_size=98689),\n", + " episode_count=10,\n", + " iteration_count=1000,\n", + " exploit=False,\n", + " render=False,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Actor-Critic ten\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "809d50f5", + "metadata": {}, + "outputs": [], + "source": [ + "actor_critic_exploit_ten = learner.gibbs_softmax_search(\n", + "gym_env,\n", + "ep,\n", + "learner=actor_critic_ten_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "exploit=True,\n", + "render=False,\n", + "plot_episodes_length=True,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Actor_Critic ten\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1f68a6d", + "metadata": {}, + "outputs": [], + "source": [ + "actor_critic_fifteen_episodes = learner.gibbs_softmax_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.ActorCriticPolicy(\n", + " ep,\n", + " gamma=0.01, λ=0.1, learning_rate=0.1, hash_size=98689),\n", + " episode_count=15,\n", + " iteration_count=1000,\n", + " exploit=False,\n", + " render=False,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Actor-Critic fifteen\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0614bcc", + "metadata": {}, + "outputs": [], + 
"source": [ + "actor_critic_exploit_fifteen = learner.gibbs_softmax_search(\n", + "gym_env,\n", + "ep,\n", + "learner=actor_critic_fifteen_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "exploit=True,\n", + "render=False,\n", + "plot_episodes_length=True,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Actor_Critic fifteen\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "723eba90", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Eligibility Trace agents/Notebooks/Actor Critic Testing With Defender.ipynb b/Eligibility Trace agents/Notebooks/Actor Critic Testing With Defender.ipynb new file mode 100644 index 00000000..6f1d885e --- /dev/null +++ b/Eligibility Trace agents/Notebooks/Actor Critic Testing With Defender.ipynb @@ -0,0 +1,140 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "19563726", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import logging\n", + "import gym\n", + "import importlib\n", + "\n", + "from cyberbattle._env.defender import ScanAndReimageCompromisedMachines\n", + "from cyberbattle._env.cyberbattle_env import AttackerGoal, DefenderConstraint\n", + "import cyberbattle.agents.baseline.plotting as p\n", + "sys.path.insert(0, '/Users/freddiebarrell/Desktop/Repository/Actor-Critic')\n", + "import agent_actor_critic as a\n", + "import agent_wrapper as w\n", + "import learner as learner\n", + "from agent_wrapper import Verbosity\n", + "\n", + "importlib.reload(learner)\n", + "importlib.reload(p)\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format=\"%(levelname)s: %(message)s\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2793e70b", + "metadata": {}, + "outputs": [], + "source": [ + "gym_env = gym.make(\"CyberBattleToyCtf-v0\",\n", + " attacker_goal=AttackerGoal(\n", + " #reward=430,\n", + " own_atleast_percent=1),\n", + " defender_constraint=DefenderConstraint(\n", + " maintain_sla=0.80\n", + " ),\n", + " defender_agent=ScanAndReimageCompromisedMachines(\n", + " probability=0.6,\n", + " scan_capacity=2,\n", + " scan_frequency=5))\n", + "\n", + "ep = w.EnvironmentBounds.of_identifiers(\n", + " maximum_node_count=12,\n", + " maximum_total_credentials=10,\n", + " identifiers=gym_env.identifiers\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cac15ff", + "metadata": {}, + "outputs": [], + "source": [ + "iteration_count = 1000\n", + "training_episode_count = 15" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5523c34", + "metadata": {}, + "outputs": [], + "source": [ + "actor_critic_with_defender = learner.gibbs_softmax_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.ActorCriticPolicy(\n", + " ep,\n", + " gamma=0.01, λ=0.1, learning_rate=0.1, hash_size=98689),\n", + " episode_count=training_episode_count,\n", + " iteration_count=iteration_count,\n", + " exploit=False,\n", + " render=False,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Actor-Critic with defender\"\n", + ")" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e5f58fe", + "metadata": {}, + "outputs": [], + "source": [ + "actor_critic_exploit_with_defender = learner.gibbs_softmax_search(\n", + "gym_env,\n", + "ep,\n", + "learner=actor_critic_with_defender['learner'],\n", + "episode_count=10,\n", + "iteration_count=1000,\n", + "exploit=True,\n", + "render=False,\n", + "plot_episodes_length=True,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Actor_Critic with defender\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06793ce8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Eligibility Trace agents/Notebooks/Sarsa Lambda Testing Basic.ipynb b/Eligibility Trace agents/Notebooks/Sarsa Lambda Testing Basic.ipynb new file mode 100644 index 00000000..187e4176 --- /dev/null +++ b/Eligibility Trace agents/Notebooks/Sarsa Lambda Testing Basic.ipynb @@ -0,0 +1,207 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "fae4d125", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import logging\n", + "import gym\n", + "from cyberbattle._env.cyberbattle_env import AttackerGoal\n", + "sys.path.insert(0, '/Users/freddiebarrell/Desktop/Repository/Sarsa(Lambda)')\n", + "import agent_sarsa_lambda as a\n", + "import agent_wrapper as w\n", + "import learner as learner\n", + "from agent_wrapper import Verbosity\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format=\"%(levelname)s: %(message)s\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3928c9e", + "metadata": {}, + "outputs": [], + "source": [ + "gym_env = gym.make(\"CyberBattleToyCtf-v0\",\n", + " attacker_goal=AttackerGoal(\n", + " reward=430,\n", + " own_atleast_percent=0.6))\n", + "\n", + "ep = w.EnvironmentBounds.of_identifiers(\n", + " maximum_node_count=12,\n", + " maximum_total_credentials=10,\n", + " identifiers=gym_env.identifiers\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fca12ff", + "metadata": {}, + "outputs": [], + "source": [ + "sarsa_lambda_five_episodes = learner.epsilon_greedy_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.SarsaLambdaPolicy(\n", + " ep,\n", + " gamma=0.015, λ=0.5, learning_rate=0.01, hash_size=98689),\n", + " episode_count=5,\n", + " iteration_count=1000,\n", + " epsilon=0.9,\n", + " render=False,\n", + " epsilon_exponential_decay=1000,\n", + " epsilon_minimum=0.01,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Sarsa_Lambda five\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c7cc52d", + "metadata": {}, + "outputs": [], + "source": [ + "sarsa_lambda_exploit_five = learner.epsilon_greedy_search(\n", + "gym_env,\n", + "ep,\n", + "learner=sarsa_lambda_five_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "epsilon=0,\n", + "render=False,\n", + "epsilon_minimum=0,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Sarsa_Lambda five\"\n", + ")" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "26cefa42", + "metadata": {}, + "outputs": [], + "source": [ + "sarsa_lambda_ten_episodes = learner.epsilon_greedy_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.SarsaLambdaPolicy(\n", + " ep,\n", + " gamma=0.015, λ=0.5, learning_rate=0.01, hash_size=98689),\n", + " episode_count=10,\n", + " iteration_count=1000,\n", + " epsilon=0.9,\n", + " render=False,\n", + " epsilon_exponential_decay=1000,\n", + " epsilon_minimum=0.01,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Sarsa_Lambda ten\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "809d50f5", + "metadata": {}, + "outputs": [], + "source": [ + "sarsa_lambda_exploit_ten = learner.epsilon_greedy_search(\n", + "gym_env,\n", + "ep,\n", + "learner=sarsa_lambda_ten_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "epsilon=0,\n", + "render=False,\n", + "epsilon_minimum=0,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Sarsa_Lambda ten\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1f68a6d", + "metadata": {}, + "outputs": [], + "source": [ + "sarsa_lambda_fifteen_episodes = learner.epsilon_greedy_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.SarsaLambdaPolicy(\n", + " ep,\n", + " gamma=0.015, λ=0.5, learning_rate=0.01, hash_size=98689),\n", + " episode_count=15,\n", + " iteration_count=1000,\n", + " epsilon=0.9,\n", + " render=False,\n", + " epsilon_exponential_decay=1000,\n", + " epsilon_minimum=0.01,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Sarsa_Lambda fifteen\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0614bcc", + "metadata": {}, + "outputs": [], + "source": [ + "sarsa_lambda_exploit_fifteen = learner.epsilon_greedy_search(\n", + "gym_env,\n", + "ep,\n", + "learner=sarsa_lambda_fifteen_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "epsilon=0,\n", + "render=False,\n", + "epsilon_minimum=0,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Sarsa_Lambda fifteen\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09551250", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Eligibility Trace agents/Notebooks/Sarsa Lambda Testing With Defender.ipynb b/Eligibility Trace agents/Notebooks/Sarsa Lambda Testing With Defender.ipynb new file mode 100644 index 00000000..6dea57de --- /dev/null +++ b/Eligibility Trace agents/Notebooks/Sarsa Lambda Testing With Defender.ipynb @@ -0,0 +1,142 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "19563726", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import logging\n", + "import gym\n", + "import importlib\n", + "\n", + "from cyberbattle._env.defender import ScanAndReimageCompromisedMachines\n", + "from cyberbattle._env.cyberbattle_env import AttackerGoal, DefenderConstraint\n", + "import cyberbattle.agents.baseline.plotting as p\n", + "sys.path.insert(0, '/Users/freddiebarrell/Desktop/Repository/Sarsa(Lambda)')\n", + "import 
agent_sarsa_lambda as a\n", + "import agent_wrapper as w\n", + "import learner as learner\n", + "from agent_wrapper import Verbosity\n", + "\n", + "importlib.reload(learner)\n", + "importlib.reload(p)\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format=\"%(levelname)s: %(message)s\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2793e70b", + "metadata": {}, + "outputs": [], + "source": [ + "gym_env = gym.make(\"CyberBattleToyCtf-v0\",\n", + " attacker_goal=AttackerGoal(\n", + " #reward=430,\n", + " own_atleast_percent=1),\n", + " defender_constraint=DefenderConstraint(\n", + " maintain_sla=0.80\n", + " ),\n", + " defender_agent=ScanAndReimageCompromisedMachines(\n", + " probability=0.6,\n", + " scan_capacity=2,\n", + " scan_frequency=5))\n", + "\n", + "ep = w.EnvironmentBounds.of_identifiers(\n", + " maximum_node_count=12,\n", + " maximum_total_credentials=10,\n", + " identifiers=gym_env.identifiers\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cac15ff", + "metadata": {}, + "outputs": [], + "source": [ + "iteration_count = 1000\n", + "training_episode_count = 15" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5523c34", + "metadata": {}, + "outputs": [], + "source": [ + "sarsa_lambda_with_defender = learner.epsilon_greedy_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.SarsaLambdaPolicy(\n", + " ep,\n", + " gamma=0.015, λ=0.5, learning_rate=0.01, hash_size=98689),\n", + " episode_count=training_episode_count,\n", + " iteration_count=iteration_count,\n", + " epsilon=0.9,\n", + " render=False,\n", + " epsilon_exponential_decay=1000,\n", + " epsilon_minimum=0.01,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Sarsa_Lambda with defender\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e5f58fe", + "metadata": {}, + "outputs": [], + "source": [ + "sarsa_lambda_exploit_with_defender = learner.epsilon_greedy_search(\n", + "gym_env,\n", + "ep,\n", + "learner=sarsa_lambda_with_defender['learner'],\n", + "episode_count=10,\n", + "iteration_count=1000,\n", + "epsilon=0,\n", + "render=False,\n", + "epsilon_minimum=0,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Sarsa_lambda with defender\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06793ce8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Eligibility Trace agents/Notebooks/Watkins Q Testing Basic.ipynb b/Eligibility Trace agents/Notebooks/Watkins Q Testing Basic.ipynb new file mode 100644 index 00000000..90021ef6 --- /dev/null +++ b/Eligibility Trace agents/Notebooks/Watkins Q Testing Basic.ipynb @@ -0,0 +1,207 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "fae4d125", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import logging\n", + "import gym\n", + "from cyberbattle._env.cyberbattle_env import AttackerGoal\n", + "sys.path.insert(0, '/Users/freddiebarrell/Desktop/Repository/Watkins Q')\n", + "import agent_watkins_q as a\n", + "import 
agent_wrapper as w\n", + "import learner as learner\n", + "from agent_wrapper import Verbosity\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format=\"%(levelname)s: %(message)s\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3928c9e", + "metadata": {}, + "outputs": [], + "source": [ + "gym_env = gym.make(\"CyberBattleToyCtf-v0\",\n", + " attacker_goal=AttackerGoal(\n", + " reward=430,\n", + " own_atleast_percent=0.6))\n", + "\n", + "ep = w.EnvironmentBounds.of_identifiers(\n", + " maximum_node_count=12,\n", + " maximum_total_credentials=10,\n", + " identifiers=gym_env.identifiers\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fca12ff", + "metadata": {}, + "outputs": [], + "source": [ + "watkins_q_five_episodes = learner.epsilon_greedy_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.WatkinsQPolicy(\n", + " ep,\n", + " gamma=0.015, λ=0.5, learning_rate=0.01, hash_size=98689),\n", + " episode_count=5,\n", + " iteration_count=1000,\n", + " epsilon=0.9,\n", + " render=False,\n", + " epsilon_exponential_decay=1000,\n", + " epsilon_minimum=0.01,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Watkins_Q five\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c7cc52d", + "metadata": {}, + "outputs": [], + "source": [ + "watkins_q_exploit_five = learner.epsilon_greedy_search(\n", + "gym_env,\n", + "ep,\n", + "learner=watkins_q_five_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "epsilon=0,\n", + "render=False,\n", + "epsilon_minimum=0,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Watkins_Q five\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26cefa42", + "metadata": {}, + "outputs": [], + "source": [ + "watkins_q_ten_episodes = learner.epsilon_greedy_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.WatkinsQPolicy(\n", + " ep,\n", + " gamma=0.015, λ=0.5, learning_rate=0.01, hash_size=98689),\n", + " episode_count=10,\n", + " iteration_count=1000,\n", + " epsilon=0.9,\n", + " render=False,\n", + " epsilon_exponential_decay=1000,\n", + " epsilon_minimum=0.01,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Watkins_Q ten\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "809d50f5", + "metadata": {}, + "outputs": [], + "source": [ + "watkins_q_exploit_ten = learner.epsilon_greedy_search(\n", + "gym_env,\n", + "ep,\n", + "learner=watkins_q_ten_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "epsilon=0,\n", + "render=False,\n", + "epsilon_minimum=0,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Watkins_Q ten\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1f68a6d", + "metadata": {}, + "outputs": [], + "source": [ + "watkins_q_fifteen_episodes = learner.epsilon_greedy_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.WatkinsQPolicy(\n", + " ep,\n", + " gamma=0.015, λ=0.5, learning_rate=0.01, hash_size=98689),\n", + " episode_count=15,\n", + " iteration_count=1000,\n", + " epsilon=0.9,\n", + " render=False,\n", + " epsilon_exponential_decay=1000,\n", + " epsilon_minimum=0.01,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Watkins_Q fifteen\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0614bcc", + "metadata": {}, + "outputs": [], + "source": [ + "watkins_q_exploit_fifteen = learner.epsilon_greedy_search(\n", + "gym_env,\n", 
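+ "# exploitation run: epsilon=0 so the agent greedily follows the values learned above\n",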
+ "ep,\n", + "learner=watkins_q_fifteen_episodes['learner'],\n", + "episode_count=10,\n", + "iteration_count=50,\n", + "epsilon=0,\n", + "render=False,\n", + "epsilon_minimum=0,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Watkins_Q fifteen\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09551250", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Eligibility Trace agents/Notebooks/Watkins Q Testing With Defender.ipynb b/Eligibility Trace agents/Notebooks/Watkins Q Testing With Defender.ipynb new file mode 100644 index 00000000..3d7c9cf6 --- /dev/null +++ b/Eligibility Trace agents/Notebooks/Watkins Q Testing With Defender.ipynb @@ -0,0 +1,142 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "19563726", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import logging\n", + "import gym\n", + "import importlib\n", + "\n", + "from cyberbattle._env.defender import ScanAndReimageCompromisedMachines\n", + "from cyberbattle._env.cyberbattle_env import AttackerGoal, DefenderConstraint\n", + "import cyberbattle.agents.baseline.plotting as p\n", + "sys.path.insert(0, '/Users/freddiebarrell/Desktop/Repository/Watkins Q')\n", + "import agent_watkins_q as a\n", + "import agent_wrapper as w\n", + "import learner as learner\n", + "from agent_wrapper import Verbosity\n", + "\n", + "importlib.reload(learner)\n", + "importlib.reload(p)\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format=\"%(levelname)s: %(message)s\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2793e70b", + "metadata": {}, + "outputs": [], + "source": [ + "gym_env = gym.make(\"CyberBattleToyCtf-v0\",\n", + " attacker_goal=AttackerGoal(\n", + " #reward=430,\n", + " own_atleast_percent=1),\n", + " defender_constraint=DefenderConstraint(\n", + " maintain_sla=0.80\n", + " ),\n", + " defender_agent=ScanAndReimageCompromisedMachines(\n", + " probability=0.6,\n", + " scan_capacity=2,\n", + " scan_frequency=5))\n", + "\n", + "ep = w.EnvironmentBounds.of_identifiers(\n", + " maximum_node_count=12,\n", + " maximum_total_credentials=10,\n", + " identifiers=gym_env.identifiers\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cac15ff", + "metadata": {}, + "outputs": [], + "source": [ + "iteration_count = 1000\n", + "training_episode_count = 15" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5523c34", + "metadata": {}, + "outputs": [], + "source": [ + "watkins_q_with_defender = learner.epsilon_greedy_search(\n", + " gym_env,\n", + " ep,\n", + " learner=a.WatkinsQPolicy(\n", + " ep,\n", + " gamma=0.015, λ=0.5, learning_rate=0.01, hash_size=98689),\n", + " episode_count=training_episode_count,\n", + " iteration_count=iteration_count,\n", + " epsilon=0.9,\n", + " render=False,\n", + " epsilon_exponential_decay=1000,\n", + " epsilon_minimum=0.01,\n", + " verbosity=Verbosity.Quiet,\n", + " title=\"Watkins_Q with defender\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "4e5f58fe", + "metadata": {}, + "outputs": [], + "source": [ + "watkins_q_exploit_with_defender = learner.epsilon_greedy_search(\n", + "gym_env,\n", + "ep,\n", + "learner=watkins_q_with_defender['learner'],\n", + "episode_count=10,\n", + "iteration_count=1000,\n", + "epsilon=0,\n", + "render=False,\n", + "epsilon_minimum=0,\n", + "verbosity=Verbosity.Quiet,\n", + "title=\"Exploiting Watkins_Q with defender\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06793ce8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Eligibility Trace agents/Sarsa(lambda)/.DS_Store b/Eligibility Trace agents/Sarsa(lambda)/.DS_Store new file mode 100644 index 00000000..07af60bf Binary files /dev/null and b/Eligibility Trace agents/Sarsa(lambda)/.DS_Store differ diff --git a/Eligibility Trace agents/Sarsa(lambda)/agent_sarsa_lambda.py b/Eligibility Trace agents/Sarsa(lambda)/agent_sarsa_lambda.py new file mode 100644 index 00000000..1423ee6d --- /dev/null +++ b/Eligibility Trace agents/Sarsa(lambda)/agent_sarsa_lambda.py @@ -0,0 +1,256 @@ +import random +from typing import NamedTuple, Optional, Tuple, Union, List +import numpy as np +from numpy import ndarray +import logging +import boolean + +from cyberbattle._env import cyberbattle_env +from agent_wrapper import EnvironmentBounds +from gym import spaces, Wrapper + +import agent_wrapper as w +from learner import Learner + +from torch import Tensor +import torch + +class StateActionModel: + + def __init__(self, ep: EnvironmentBounds): + self.ep = ep + + self.global_features = w.ConcatFeatures(ep, [ + w.Feature_discovered_not_owned_nodes_sliding(ep), + w.Feature_discovered_credential_count(ep) + ]) + + self.source_node_features = w.ConcatFeatures(ep, [ + w.Feature_active_node_properties(ep), + w.Feature_success_actions_at_node(ep) + ]) + + self.target_node_features = w.ConcatFeatures(ep, [ + w.Feature_active_node_id(ep) + ]) + + self.state_space = w.ConcatFeatures(ep, self.global_features.feature_selection + + self.source_node_features.feature_selection + + self.target_node_features.feature_selection) + + self.action_space = w.AbstractAction(ep) + + def valid_actions(self, wrapped_env: w.AgentWrapper, observation): + """returns a list of valid actions and the nodes they can be carried out from""" + + nodes_and_actions = [] + discovered_nodes = np.union1d(w.owned_nodes(observation), w.discovered_nodes_notowned(observation)) + + for from_node in w.owned_nodes(observation): + for local_action in range(self.action_space.n_local_actions): + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, local_action, None) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, local_action, -1)) + + for remote_action in range(self.action_space.n_local_actions, self.action_space.n_local_actions + self.action_space.n_remote_actions): + for target_node in discovered_nodes: + if target_node != from_node: + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, 
remote_action, target_node) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, remote_action, target_node)) + + for connect_action in range(self.action_space.n_local_actions + self.action_space.n_remote_actions, self.action_space.n_actions): + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, connect_action, None) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, connect_action, -1)) + + return nodes_and_actions + +class Memory: + + def __init__(self, ep:EnvironmentBounds, hash_size): + self.hash_size = hash_size + + self.memory = torch.zeros([2, hash_size], dtype=torch.float64) + + def state_action_index(self, state_space, abstract_action): + """Turns a state action pair into an index for the memory tensor""" + feature_vector = np.append(state_space, abstract_action) + hash_number = abs(hash(str(feature_vector))) + return hash_number % self.hash_size + +class ChosenActionMetadata(NamedTuple): + + abstract_action: np.int32 + actor_node: int + actor_features: ndarray + actor_state: ndarray + + def __repr__(self) -> str: + return f"[abstract_action={self.abstract_action}, actor={self.actor_node}, state={self.actor_state}]" + +class SarsaLambdaPolicy(Learner): + + def __init__(self, + ep: EnvironmentBounds, + gamma: float, + λ: float, + learning_rate: float, + hash_size: int + ): + + self.model = StateActionModel(ep) + self.n_local_actions = ep.local_attacks_count + self.n_remote_actions = ep.remote_attacks_count + self.gamma = gamma + self.λ = λ + self.learning_rate = learning_rate + self.hash_size = hash_size + + self.memory = Memory(ep, hash_size=hash_size) + + def parameters_as_string(self): + return f'γ={self.gamma}, lr={self.learning_rate}, λ={self.λ},\n' \ + f'hash_size={self.hash_size}' + + def all_parameters_as_string(self) -> str: + model = self.model + return f'{self.parameters_as_string()}\n' \ + f'dimension={model.state_space.flat_size()}x{model.action_space.flat_size()}, ' \ + f'Q={[f.name() for f in model.state_space.feature_selection]} ' \ + f"-> 'abstract_action'" + + def get_actor_state_vector(self, global_state: ndarray, actor_features: ndarray, target_features: Optional[ndarray]) -> ndarray: + """Turns seperate state features into one vector""" + if target_features is None: + return np.concatenate((np.array(global_state, dtype=np.float32), + np.array(actor_features, dtype=np.float32))) + else: + return np.concatenate((np.array(global_state, dtype=np.float32), + np.array(actor_features, dtype=np.float32), + np.array(target_features, dtype=np.float32))) + + def update_memory(self, + reward: float, + actor_state: ndarray, + abstract_action: int, + next_actor_state: Optional[ndarray], + next_abstract_action: Optional[int]): + + current_state_action_index = self.memory.state_action_index(actor_state, abstract_action) + if next_actor_state is None: + δ = reward - self.memory.memory[0][current_state_action_index].item() + else: + next_state_action_index = self.memory.state_action_index(next_actor_state, next_abstract_action) + δ = reward + (self.gamma * self.memory.memory[0][next_state_action_index].item()) - self.memory.memory[0][current_state_action_index].item() + + self.memory.memory[1][current_state_action_index] += 1 + + non_zero_indicies_q = torch.argwhere(self.memory.memory[0]).numpy() + non_zero_indicies_e = torch.argwhere(self.memory.memory[1]).numpy() + non_zero_indicies = 
np.union1d(non_zero_indicies_q, non_zero_indicies_e) + + for i in non_zero_indicies: + + self.memory.memory[0][i] = self.memory.memory[0][i].item() + float(self.learning_rate * δ * self.memory.memory[1][i].item()) + self.memory.memory[0][i] = round(self.memory.memory[0][i].item(), 5) + self.memory.memory[0][i] = max(0, self.memory.memory[0][i].item()) + self.memory.memory[0][i] = min(100, self.memory.memory[0][i].item()) + + self.memory.memory[1][i] = self.memory.memory[1][i].item() * float(self.gamma * self.λ) + self.memory.memory[1][i] = round(self.memory.memory[0][i].item(), 5) + + def on_step(self, wrapped_env: w.AgentWrapper, + observation, reward: float, done: bool, action_metadata, epsilon): + + if done: + self.update_memory(reward, + actor_state=action_metadata.actor_state, + abstract_action=action_metadata.abstract_action, + next_actor_state=None, + next_abstract_action=None + ) + else: + x = np.random.rand() + if x <= epsilon: + _, _, future_action_metadata = self.explore(wrapped_env) + else: + _, _, future_action_metadata = self.exploit(wrapped_env, observation) + + self.update_memory(reward, + actor_state=action_metadata.actor_state, + abstract_action=action_metadata.abstract_action, + next_actor_state=future_action_metadata.actor_state, + next_abstract_action=future_action_metadata.abstract_action + ) + + + def new_episode(self): + torch.mul(self.memory.memory[1], 0) + + def end_of_episode(self, i_episode, t): + return None + + def end_of_iteration(self, t, done): + return None + + def metadata_from_gymaction(self, wrapped_env, gym_action): + """Takes in a gym action and returns it's metadata""" + current_global_state = self.model.global_features.get(wrapped_env.state, node=None) + actor_node = cyberbattle_env.sourcenode_of_action(gym_action) + actor_features = self.model.source_node_features.get(wrapped_env.state, actor_node) + abstract_action = self.model.action_space.abstract_from_gymaction(gym_action) + + if 'remote_vulnerability' in gym_action: + target_node = self.model.target_node_features.get(wrapped_env.state, gym_action['remote_vulnerability'][1]) + else: + target_node = None + + return ChosenActionMetadata( + abstract_action=abstract_action, + actor_node=actor_node, + actor_features=actor_features, + actor_state=self.get_actor_state_vector(current_global_state, actor_features, target_node)) + + def stateaction_as_string(self, action_metadata) -> str: + return '' + + def explore(self, wrapped_env: w.AgentWrapper + ) -> Tuple[str, cyberbattle_env.Action, object]: + + gym_action = wrapped_env.env.sample_valid_action(kinds=[0, 1, 2]) + metadata = self.metadata_from_gymaction(wrapped_env, gym_action) + return gym_action, metadata + + def exploit(self, wrapped_env: w.AgentWrapper, observation) -> Tuple[str, Optional[cyberbattle_env.Action], object]: + + current_global_state = self.model.global_features.get(wrapped_env.state, node=None) + valid_nodes_and_actions = self.model.valid_actions(wrapped_env, observation) + + #The q_values are the estimated returns from an action taken in the current state + q_values = [] + for item in valid_nodes_and_actions: + source_node_features = self.model.source_node_features.get(wrapped_env.state, item[0]) + + if item[1] < self.n_local_actions or item[1] - self.n_local_actions > self.n_remote_actions: + actor_state_vector = self.get_actor_state_vector(current_global_state, source_node_features, None) + else: + target_node_features = self.model.target_node_features.get(wrapped_env.state, item[2]) + actor_state_vector = 
self.get_actor_state_vector(current_global_state, source_node_features, target_node_features) + + action_state_index = self.memory.state_action_index(actor_state_vector, item[1]) + + q_values.append(self.memory.memory[0][action_state_index].item()) + + indicies_of_chosen_actions = [i for i, x in enumerate(q_values) if x == max(q_values)] + chosen_action_index = random.choice(indicies_of_chosen_actions) + chosen_action = valid_nodes_and_actions[chosen_action_index] + + if chosen_action[1] < self.n_local_actions or chosen_action[1] - self.n_local_actions > self.n_remote_actions: + gym_action = self.model.action_space.abstract_to_gymaction(chosen_action[0], observation, chosen_action[1], None) + else: + gym_action = self.model.action_space.abstract_to_gymaction(chosen_action[0], observation, chosen_action[1], chosen_action[2]) + + metadata = self.metadata_from_gymaction(wrapped_env, gym_action) + + return gym_action, metadata \ No newline at end of file diff --git a/Eligibility Trace agents/Sarsa(lambda)/agent_wrapper.py b/Eligibility Trace agents/Sarsa(lambda)/agent_wrapper.py new file mode 100644 index 00000000..985aa1e9 --- /dev/null +++ b/Eligibility Trace agents/Sarsa(lambda)/agent_wrapper.py @@ -0,0 +1,342 @@ +from cyberbattle._env.cyberbattle_env import EnvironmentBounds +from typing import Optional, List +import enum +import numpy as np +from gym import spaces, Wrapper +from numpy import ndarray +import cyberbattle._env.cyberbattle_env as cyberbattle_env +import logging + + +class StateAugmentation: + """Default agent state augmentation, consisting of the gym environment + observation itself and nothing more.""" + + def __init__(self, observation: cyberbattle_env.Observation): + self.observation = observation + + def on_step(self, action: cyberbattle_env.Action, reward: float, done: bool, observation: cyberbattle_env.Observation): + self.observation = observation + + def on_reset(self, observation: cyberbattle_env.Observation): + self.observation = observation + + +class Feature(spaces.MultiDiscrete): + """ + Feature consisting of multiple discrete dimensions. + Parameters: + nvec: is a vector defining the number of possible values + for each discrete space. 
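+
+    Example (a hypothetical subclass, not defined in this module):
+
+        class Feature_owned_node_count(Feature):
+            # count of owned nodes, one discrete dimension in [0, maximum_node_count]
+            def __init__(self, p: EnvironmentBounds):
+                super().__init__(p, [p.maximum_node_count + 1])
+
+            def get(self, a: StateAugmentation, node):
+                return np.array([len(owned_nodes(a.observation))], dtype=np.int32)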
+ """ + + def __init__(self, env_properties: EnvironmentBounds, nvec): + self.env_properties = env_properties + super().__init__(nvec) + + def flat_size(self): + return np.prod(self.nvec) + + def name(self): + """Return the name of the feature""" + p = len(type(Feature(self.env_properties, [])).__name__) + 1 + return type(self).__name__[p:] + + def get(self, a: StateAugmentation, node: Optional[int]) -> np.ndarray: + """Compute the current value of a feature value at + the current observation and specific node""" + raise NotImplementedError + + def pretty_print(self, v): + return v + +class Feature_active_node_properties(Feature): + """Bitmask of all properties set for the active node""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [2] * p.property_count) + + def get(self, a: StateAugmentation, node) -> ndarray: + assert node is not None, 'feature only valid in the context of a node' + + node_prop = a.observation['discovered_nodes_properties'] + + # list of all properties set/unset on the node + # Remap to get rid of unknown value 0: 1 -> 1, and -1 -> 0 (and 0-> 0) + assert node < len(node_prop), f'invalid node index {node} (not discovered yet)' + remapped = np.array((1 + node_prop[node]) / 2, dtype=int) + return remapped + + +class Feature_active_node_id(Feature): + """Return the node id itself""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count] * 1) + + def get(self, a: StateAugmentation, node) -> ndarray: + return np.array([node], dtype=int) + + +class Feature_discovered_credential_count(Feature): + """number of credentials discovered so far""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_total_credentials + 1]) + + def get(self, a: StateAugmentation, node): + return [len(a.observation['credential_cache_matrix'])] + + +class Feature_discovered_not_owned_nodes_sliding(Feature): + """array of which of discovered nodes not owned by name""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count + 1]) + + def get(self, a: StateAugmentation, node): + discovered = a.observation['discovered_nodes'] + levels = a.observation['nodes_privilegelevel'] + owned_nodes_indices = np.where(levels > 0)[0] + owned = [] + for i in owned_nodes_indices: + owned.append(discovered[i]) + discovered_not_owned = [] + for node in discovered: + if node not in owned: + discovered_not_owned.append(node) + discovered_not_owned_sliding = np.zeros(self.env_properties.maximum_node_count, np.int32) + for node_id in discovered_not_owned: + if node_id == 'client': + discovered_not_owned_sliding[0] = 1 + elif node_id == 'Website': + discovered_not_owned_sliding[1] = 1 + elif node_id == 'Website.Directory': + discovered_not_owned_sliding[2] = 1 + elif node_id == 'Website[user=monitor]': + discovered_not_owned_sliding[3] = 1 + elif node_id == 'GitHubProject': + discovered_not_owned_sliding[4] = 1 + elif node_id == 'AzureStorage': + discovered_not_owned_sliding[5] = 1 + elif node_id == 'Sharepoint': + discovered_not_owned_sliding[6] = 1 + elif node_id == 'AzureResourceManager': + discovered_not_owned_sliding[7] = 1 + elif node_id == 'AzureResourceManager[user-monitor]': + discovered_not_owned_sliding[8] = 1 + elif node_id == 'AzureVM': + discovered_not_owned_sliding[9] = 1 + return discovered_not_owned_sliding + +class Feature_active_node_id(Feature): + """number asigned to each type of node in toy-ctf""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count + 
1]) + + def get(self, a: StateAugmentation, node): + node_id = a.observation['discovered_nodes'][node] + + node_id_array = np.zeros(1, np.int32) + if node_id == 'client': + node_id_array[0] = 0 + elif node_id == 'Website': + node_id_array[0] = 1 + elif node_id == 'Website.Directory': + node_id_array[0] = 2 + elif node_id == 'Website[user=monitor]': + node_id_array[0] = 3 + elif node_id == 'GitHubProject': + node_id_array[0] = 4 + elif node_id == 'AzureStorage': + node_id_array[0] = 5 + elif node_id == 'Sharepoint': + node_id_array[0] = 6 + elif node_id == 'AzureResourceManager': + node_id_array[0] = 7 + elif node_id == 'AzureResourceManager[user-monitor]': + node_id_array[0] = 8 + elif node_id == 'AzureVM': + node_id_array[0] = 9 + else: + node_id_array[0] = 10 + return node_id_array + + +class ConcatFeatures(Feature): + """ Concatenate a list of features into a single feature + Parameters: + feature_selection - a selection of features to combine + """ + + def __init__(self, p: EnvironmentBounds, feature_selection: List[Feature]): + self.feature_selection = feature_selection + self.dim_sizes = np.concatenate([f.nvec for f in feature_selection]) + super().__init__(p, [self.dim_sizes]) + + def pretty_print(self, v): + return v + + def get(self, a: StateAugmentation, node=None) -> np.ndarray: + """Return the feature vector""" + feature_vector = [f.get(a, node) for f in self.feature_selection] + return np.concatenate(feature_vector) + + +def owned_nodes(observation): + """Return the list of owned nodes""" + return np.nonzero(observation['nodes_privilegelevel'])[0] + + +def discovered_nodes_notowned(observation): + """Return the list of discovered nodes that are not owned yet""" + return np.nonzero(observation['nodes_privilegelevel'] == 0)[0] + + +class AbstractAction(Feature): + """An abstraction of the gym state space that reduces + the space dimension for learning use to just + - local_attack(vulnid) (source_node provided) + - remote_attack(vulnid) (source_node provided, target_node forgotten) + - connect(port) (source_node provided, target_node forgotten, credentials infered from cache) + """ + + def __init__(self, p: EnvironmentBounds): + self.n_local_actions = p.local_attacks_count + self.n_remote_actions = p.remote_attacks_count + self.n_connect_actions = p.port_count + self.n_actions = self.n_local_actions + self.n_remote_actions + self.n_connect_actions + super().__init__(p, [self.n_actions]) + + def abstract_to_gymaction(self, source_node, observation, abstract_action, target_node): + """Takes a statring node and an abstract action number and returns a gym action""" + + if abstract_action < self.n_local_actions: + vuln = abstract_action + return {'local_vulnerability': np.array([source_node, vuln])} + + node_prop = observation['discovered_nodes_properties'] + abstract_action -= self.n_local_actions + if abstract_action < self.n_remote_actions: + vuln = abstract_action + + discovered_nodes_count = len(node_prop) + if discovered_nodes_count <= 1: + return None + + return {'remote_vulnerability': np.array([source_node, target_node, vuln])} + + abstract_action -= self.n_remote_actions + port = np.int32(abstract_action) + + discovered_credentials = np.array(observation['credential_cache_matrix']) + n_discovered_creds = len(discovered_credentials) + if n_discovered_creds <= 0: + return None + + nodes_not_owned = discovered_nodes_notowned(observation) + match_port = discovered_credentials[:, 1] == port + match_port_indicies = np.where(match_port)[0] + + credential_indices_choices = [c for c 
in match_port_indicies + if discovered_credentials[c, 0] in nodes_not_owned] + + if credential_indices_choices: + logging.debug('found matching cred in the credential cache') + else: + logging.debug('no cred matching requested port, trying instead creds used to access other ports') + credential_indices_choices = [i for (i, n) in enumerate(discovered_credentials[:, 0]) + if n in nodes_not_owned] + + if credential_indices_choices: + logging.debug('found cred in the credential cache without matching port name') + else: + logging.debug('no cred to use from the credential cache') + return None + + cred = np.int32(np.random.choice(credential_indices_choices)) + target = np.int32(discovered_credentials[cred, 0]) + return {'connect': np.array([source_node, target, port, cred], dtype=np.int32)} + + def abstract_from_gymaction(self, gym_action: cyberbattle_env.Action) -> np.int32: + """Turns a gym action into it's abstract action number""" + if 'local_vulnerability' in gym_action: + return gym_action['local_vulnerability'][1] + elif 'remote_vulnerability' in gym_action: + r = gym_action['remote_vulnerability'] + return self.n_local_actions + r[2] + + assert 'connect' in gym_action + c = gym_action['connect'] + + a = self.n_local_actions + self.n_remote_actions + c[2] + assert a < self.n_actions + return np.int32(a) + + +class ActionTrackingStateAugmentation(StateAugmentation): + """An agent state augmentation consisting of + the environment observation augmented with the following dynamic information: + - success_action_count: count of action taken and succeeded at the current node + - failed_action_count: count of action taken and failed at the current node + """ + + def __init__(self, p: EnvironmentBounds, observation: cyberbattle_env.Observation): + self.aa = AbstractAction(p) + self.success_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.failed_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.env_properties = p + super().__init__(observation) + + def on_step(self, action: cyberbattle_env.Action, reward: float, done: bool, observation: cyberbattle_env.Observation): + node = cyberbattle_env.sourcenode_of_action(action) + abstract_action = self.aa.abstract_from_gymaction(action) + if reward > 0: + self.success_action_count[node, abstract_action] += 1 + else: + self.failed_action_count[node, abstract_action] += 1 + super().on_step(action, reward, done, observation) + + def on_reset(self, observation: cyberbattle_env.Observation): + p = self.env_properties + self.success_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.failed_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + super().on_reset(observation) + + +class Feature_success_actions_at_node(Feature): + """number of time each action succeeded at a given node""" + + max_action_count = 100 + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [self.max_action_count] * AbstractAction(p).n_actions) + + def get(self, a: ActionTrackingStateAugmentation, node: int): + return np.minimum(a.success_action_count[node, :], self.max_action_count - 1) + + +class Verbosity(enum.Enum): + """Verbosity of the learning function""" + Quiet = 0 + Normal = 1 + Verbose = 2 + + +class AgentWrapper(Wrapper): + """Gym wrapper to update the agent state on every step""" + + def __init__(self, env: cyberbattle_env.CyberBattleEnv, state: StateAugmentation): + super().__init__(env) 
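+        # `state` is the StateAugmentation (e.g. ActionTrackingStateAugmentation)
+        # shared with the feature extractors; step() and reset() below forward every
+        # observation to it so that Feature.get() always sees up-to-date data.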
+ self.state = state + + def step(self, action: cyberbattle_env.Action): + observation, reward, done, info = self.env.step(action) + self.state.on_step(action, reward, done, observation) + return observation, reward, done, info + + def reset(self): + observation = self.env.reset() + self.state.on_reset(observation) + return observation \ No newline at end of file diff --git a/Eligibility Trace agents/Sarsa(lambda)/learner.py b/Eligibility Trace agents/Sarsa(lambda)/learner.py new file mode 100644 index 00000000..7553e66e --- /dev/null +++ b/Eligibility Trace agents/Sarsa(lambda)/learner.py @@ -0,0 +1,269 @@ +import math +import sys + +from plotting import PlotTraining, plot_averaged_cummulative_rewards +from agent_wrapper import AgentWrapper, EnvironmentBounds, Verbosity, ActionTrackingStateAugmentation +import logging +import numpy as np +from cyberbattle._env import cyberbattle_env +from typing import Tuple, Optional, TypedDict, List +import progressbar +import abc + +class Agent(abc.ABC): + + @abc.abstractmethod + def explore(self, wrapped_env: AgentWrapper) -> Tuple[cyberbattle_env.Action, object]: + """Exploration function. + Returns (action_type, gym_action, action_metadata) where + action_metadata is a custom object that gets passed to the on_step callback function""" + raise NotImplementedError + + @abc.abstractmethod + def exploit(self, wrapped_env: AgentWrapper, observation) -> Tuple[Optional[cyberbattle_env.Action], object]: + """Exploit function. + Returns (action_type, gym_action, action_metadata) where + action_metadata is a custom object that gets passed to the on_step callback function""" + raise NotImplementedError + + @abc.abstractmethod + def on_step(self, wrapped_env: AgentWrapper, observation, reward, done, action_metadata, epsilon) -> None: + raise NotImplementedError + + def parameters_as_string(self) -> str: + return '' + + def all_parameters_as_string(self) -> str: + return '' + + def loss_as_string(self) -> str: + return '' + + def stateaction_as_string(self, action_metadata) -> str: + return '' + +Breakdown = TypedDict('Breakdown', { + 'local': int, + 'remote': int, + 'connect': int +}) + +Outcomes = TypedDict('Outcomes', { + 'reward': Breakdown, + 'noreward': Breakdown +}) + +Stats = TypedDict('Stats', { + 'exploit': Outcomes, + 'explore': Outcomes +}) + +TrainedAgent = TypedDict('TrainedAgent', { + 'all_episodes_rewards': List[List[float]], + 'all_episodes_availability': List[List[float]], + 'agent': Agent, + 'trained_on': str, + 'title': str +}) + +def print_stats(stats): + """Print learning statistics""" + def print_breakdown(stats, actiontype: str): + def ratio(kind: str) -> str: + x, y = stats[actiontype]['reward'][kind], stats[actiontype]['noreward'][kind] + sum = x + y + if sum == 0: + return 'NaN' + else: + return f"{(x / sum):.2f}" + + def print_kind(kind: str): + print( + f" {actiontype}-{kind}: {stats[actiontype]['reward'][kind]}/{stats[actiontype]['noreward'][kind]} " + f"({ratio(kind)})") + print_kind('local') + print_kind('remote') + print_kind('connect') + + print(" Breakdown [Reward/NoReward (Success rate)]") + print_breakdown(stats, 'explore') + print_breakdown(stats, 'exploit') + +def epsilon_greedy_search( + cyberbattle_gym_env: cyberbattle_env.CyberBattleEnv, + environment_properties: EnvironmentBounds, + agent: Agent, + title: str, + episode_count: int, + iteration_count: int, + epsilon: float, + epsilon_minimum=0.0, + epsilon_multdecay: Optional[float] = None, + epsilon_exponential_decay: Optional[int] = None, + render=True, + 
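+        # file prefix for PNG snapshots written when the final episode earns a reward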
render_last_episode_rewards_to: Optional[str] = None, + verbosity: Verbosity = Verbosity.Normal, + plot_episodes_length=True +) -> TrainedAgent: + + print(f"###### {title}\n" + f"Learning with: episode_count={episode_count}," + f"iteration_count={iteration_count}," + f"ϵ={epsilon}," + f'ϵ_min={epsilon_minimum}, ' + + (f"ϵ_multdecay={epsilon_multdecay}," if epsilon_multdecay else '') + + (f"ϵ_expdecay={epsilon_exponential_decay}," if epsilon_exponential_decay else '') + + f"{agent.parameters_as_string()}") + + initial_epsilon = epsilon + + all_episodes_rewards = [] + all_episodes_availability = [] + + wrapped_env = AgentWrapper(cyberbattle_gym_env, + ActionTrackingStateAugmentation(environment_properties, cyberbattle_gym_env.reset())) + steps_done = 0 + plot_title = f"{title} (epochs={episode_count}, ϵ={initial_epsilon}, ϵ_min={epsilon_minimum}," \ + + (f"ϵ_multdecay={epsilon_multdecay}," if epsilon_multdecay else '') \ + + (f"ϵ_expdecay={epsilon_exponential_decay}," if epsilon_exponential_decay else '') \ + + agent.parameters_as_string() + plottraining = PlotTraining(title=plot_title, render_each_episode=render) + + render_file_index = 1 + + for i_episode in range(1, episode_count + 1): + + print(f" ## Episode: {i_episode}/{episode_count} '{title}' " + f"ϵ={epsilon:.4f}, " + f"{agent.parameters_as_string()}") + + observation = wrapped_env.reset() + total_reward = 0.0 + all_rewards = [] + all_availability = [] + agent.new_episode() + + stats = Stats(exploit=Outcomes(reward=Breakdown(local=0, remote=0, connect=0), + noreward=Breakdown(local=0, remote=0, connect=0)), + explore=Outcomes(reward=Breakdown(local=0, remote=0, connect=0), + noreward=Breakdown(local=0, remote=0, connect=0)) + ) + + episode_ended_at = None + sys.stdout.flush() + + bar = progressbar.ProgressBar( + widgets=[ + 'Episode ', + f'{i_episode}', + '|Iteration ', + progressbar.Counter(), + '|', + progressbar.Variable(name='reward', width=6, precision=10), + '|', + progressbar.Variable(name='last_reward_at', width=4), + '|', + progressbar.Timer(), + progressbar.Bar() + ], + redirect_stdout=False) + + for t in bar(range(1, 1 + iteration_count)): + + if epsilon_exponential_decay: + epsilon = epsilon_minimum + math.exp(-1. 
* steps_done / + epsilon_exponential_decay) * (initial_epsilon - epsilon_minimum) + + steps_done += 1 + + x = np.random.rand() + if x <= epsilon: + gym_action, action_metadata = agent.explore(wrapped_env) + action_style = "explore" + else: + gym_action, action_metadata = agent.exploit(wrapped_env, observation) + action_style = "exploit" + + # Take the step + logging.debug(f"gym_action={gym_action}, action_metadata={action_metadata}") + observation, reward, done, info = wrapped_env.step(gym_action) + + action_type = 'exploit' if action_style == 'exploit' else 'explore' + outcome = 'reward' if reward > 0 else 'noreward' + if 'local_vulnerability' in gym_action: + stats[action_type][outcome]['local'] += 1 + elif 'remote_vulnerability' in gym_action: + stats[action_type][outcome]['remote'] += 1 + else: + stats[action_type][outcome]['connect'] += 1 + + agent.on_step(wrapped_env, observation, reward, done, action_metadata, epsilon) + assert np.shape(reward) == () + + all_rewards.append(reward) + all_availability.append(info['network_availability']) + total_reward += reward + bar.update(t, reward=total_reward) + if reward > 0: + bar.update(t, last_reward_at=t) + + if verbosity == Verbosity.Verbose or (verbosity == Verbosity.Normal and reward > 0): + sign = ['-', '+'][reward > 0] + + print(f" {sign} t={t} {action_style} r={reward} cum_reward:{total_reward} " + f"a={action_metadata}-{gym_action} " + f"creds={len(observation['credential_cache_matrix'])} " + f" {agent.stateaction_as_string(action_metadata)}") + + if i_episode == episode_count \ + and render_last_episode_rewards_to is not None \ + and reward > 0: + fig = cyberbattle_gym_env.render_as_fig() + fig.write_image(f"{render_last_episode_rewards_to}-e{i_episode}-{render_file_index}.png") + render_file_index += 1 + + agent.end_of_iteration(t, done) + + if done: + episode_ended_at = t + bar.finish(dirty=True) + break + + sys.stdout.flush() + + loss_string = agent.loss_as_string() + if loss_string: + loss_string = "loss={loss_string}" + + if episode_ended_at: + print(f" Episode {i_episode} ended at t={episode_ended_at} {loss_string}") + else: + print(f" Episode {i_episode} stopped at t={iteration_count} {loss_string}") + + print_stats(stats) + + all_episodes_rewards.append(all_rewards) + all_episodes_availability.append(all_availability) + + length = episode_ended_at if episode_ended_at else iteration_count + agent.end_of_episode(i_episode=i_episode, t=length) + if plot_episodes_length: + plottraining.episode_done(length) + if render: + wrapped_env.render() + + if epsilon_multdecay: + epsilon = max(epsilon_minimum, epsilon * epsilon_multdecay) + + wrapped_env.close() + print("simulation ended") + if plot_episodes_length: + plottraining.plot_end() + + return TrainedAgent( + all_episodes_rewards=all_episodes_rewards, + all_episodes_availability=all_episodes_availability, + agent=agent, + trained_on=cyberbattle_gym_env.name, + title=plot_title + ) \ No newline at end of file diff --git a/Eligibility Trace agents/Sarsa(lambda)/plotting.py b/Eligibility Trace agents/Sarsa(lambda)/plotting.py new file mode 100644 index 00000000..e51b4a55 --- /dev/null +++ b/Eligibility Trace agents/Sarsa(lambda)/plotting.py @@ -0,0 +1,203 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
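+#
+# Typical use (a sketch; the run variables are the result dicts returned by
+# learner.epsilon_greedy_search, as in the accompanying notebooks):
+#
+#     runs = [watkins_q_five_episodes, watkins_q_ten_episodes]
+#     plot_averaged_cummulative_rewards("Watkins Q(λ) benchmark", runs)
+#     plot_averaged_availability("network availability", runs)
+#     plot_episodes_length(runs)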
+ +"""Plotting helpers for agent banchmarking""" + +import matplotlib.pyplot as plt # type:ignore +import numpy as np + + +def new_plot(title): + """Prepare a new plot of cumulative rewards""" + plt.figure(figsize=(10, 8)) + plt.ylabel('cumulative reward', fontsize=20) + plt.xlabel('step', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title(title, fontsize=12) + + +def pad(array, length): + """Pad an array with 0s to make it of desired length""" + padding = np.zeros((length,)) + padding[:len(array)] = array + return padding + + +def plot_episodes_rewards_averaged(results): + """Plot cumulative rewards for a given set of specified episodes""" + max_iteration_count = np.max([len(r) for r in results['all_episodes_rewards']]) + + all_episodes_rewards_padded = [pad(rewards, max_iteration_count) for rewards in results['all_episodes_rewards']] + cumrewards = np.cumsum(all_episodes_rewards_padded, axis=1) + avg = np.average(cumrewards, axis=0) + std = np.std(cumrewards, axis=0) + x = [i for i in range(len(std))] + plt.plot(x, avg, label=results['title']) + plt.fill_between(x, avg - std, avg + std, alpha=0.5) + + +def fill_with_latest_value(array, length): + pad = length - len(array) + if pad > 0: + return np.pad(array, (0, pad), mode='edge') + else: + return array + + +def plot_episodes_availability_averaged(results): + """Plot availability for a given set of specified episodes""" + data = results['all_episodes_availability'] + longest_episode_length = np.max([len(r) for r in data]) + + all_episodes_padded = [fill_with_latest_value(av, longest_episode_length) for av in data] + avg = np.average(all_episodes_padded, axis=0) + std = np.std(all_episodes_padded, axis=0) + x = [i for i in range(len(std))] + plt.plot(x, avg, label=results['title']) + plt.fill_between(x, avg - std, avg + std, alpha=0.5) + + +def plot_episodes_length(learning_results): + """Plot length of every episode""" + plt.figure(figsize=(10, 8)) + plt.ylabel('#iterations', fontsize=20) + plt.xlabel('episode', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title("Length of each episode", fontsize=12) + + for results in learning_results: + iterations = [len(e) for e in results['all_episodes_rewards']] + episode = [i for i in range(len(results['all_episodes_rewards']))] + plt.plot(episode, iterations, label=f"{results['title']}") + + plt.legend(loc="upper right") + plt.show() + + +def plot_each_episode(results): + """Plot cumulative rewards for each episode""" + for i, episode in enumerate(results['all_episodes_rewards']): + cumrewards = np.cumsum(episode) + x = [i for i in range(len(cumrewards))] + plt.plot(x, cumrewards, label=f'Episode {i}') + + +def plot_all_episodes(r): + """Plot cumulative rewards for every episode""" + new_plot(r['title']) + plot_each_episode(r) + plt.legend(loc="lower right") + plt.show() + + +def plot_averaged_cummulative_rewards(title, all_runs): + """Plot averaged cumulative rewards""" + new_plot(title) + for r in all_runs: + plot_episodes_rewards_averaged(r) + plt.legend(loc="lower right") + plt.show() + + +def plot_averaged_availability(title, all_runs): + """Plot averaged network availability""" + plt.figure(figsize=(10, 8)) + plt.ylabel('network availability', fontsize=20) + plt.xlabel('step', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title(title, fontsize=12) + for r in all_runs: + plot_episodes_availability_averaged(r) + plt.legend(loc="lower right") + plt.show() + + +def new_plot_loss(): + """Plot MSE loss averaged over all episodes""" + 
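+    # Only prepares the shared axes; per-run loss curves are overlaid afterwards
+    # with plot_all_episodes_loss().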
plt.figure(figsize=(10, 8)) + plt.ylabel('loss', fontsize=20) + plt.xlabel('episodes', fontsize=20) + plt.xticks(size=12) + plt.yticks(size=20) + plt.title("Loss", fontsize=12) + + +def plot_all_episodes_loss(all_episodes_losses, name, label): + """Plot loss for one learning episode""" + x = [i for i in range(len(all_episodes_losses))] + plt.plot(x, all_episodes_losses, label=f'{name} {label}') + + +def running_mean(x, size): + """return moving average of x for a window of lenght 'size'""" + cumsum = np.cumsum(np.insert(x, 0, 0)) + return (cumsum[size:] - cumsum[:-size]) / float(size) + + +class PlotTraining: + """Plot training-related stats""" + + def __init__(self, title, render_each_episode): + self.episode_durations = [] + self.title = title + self.render_each_episode = render_each_episode + + def plot_durations(self, average_window=5): + # plt.figure(2) + plt.figure() + # plt.clf() + durations_t = np.array(self.episode_durations, dtype=np.float32) + plt.title('Training...') + plt.xlabel('Episode') + plt.ylabel('Duration') + plt.title(self.title, fontsize=12) + + episodes = [i + 1 for i in range(len(self.episode_durations))] + plt.plot(episodes, durations_t) + # plot episode running averages + if len(durations_t) >= average_window: + means = running_mean(durations_t, average_window) + means = np.concatenate((np.zeros(average_window - 1), means)) + plt.plot(episodes, means) + + # display.display(plt.gcf()) + plt.show() + + def episode_done(self, length): + self.episode_durations.append(length) + if self.render_each_episode: + self.plot_durations() + + def plot_end(self): + self.plot_durations() + plt.ioff() # type: ignore + # plt.show() + + +def length_of_all_episodes(run): + """Get the length of every episode""" + return [len(e) for e in run['all_episodes_rewards']] + + +def reduce(x, desired_width): + return [np.average(c) for c in np.array_split(x, desired_width)] + + +def episodes_rewards_averaged(run): + """Plot cumulative rewards for a given set of specified episodes""" + max_iteration_count = np.max([len(r) for r in run['all_episodes_rewards']]) + all_episodes_rewards_padded = [pad(rewards, max_iteration_count) for rewards in run['all_episodes_rewards']] + cumrewards = np.cumsum(all_episodes_rewards_padded, axis=1) + avg = np.average(cumrewards, axis=0) + return list(avg) + + +def episodes_lengths_for_all_runs(all_runs): + return [length_of_all_episodes(run) for run in all_runs] + + +def averaged_cummulative_rewards(all_runs, width): + return [reduce(episodes_rewards_averaged(run), width) for run in all_runs] diff --git a/Eligibility Trace agents/Watkins Q/.DS_Store b/Eligibility Trace agents/Watkins Q/.DS_Store new file mode 100644 index 00000000..77ff7bfb Binary files /dev/null and b/Eligibility Trace agents/Watkins Q/.DS_Store differ diff --git a/Eligibility Trace agents/Watkins Q/agent_watkins_q.py b/Eligibility Trace agents/Watkins Q/agent_watkins_q.py new file mode 100644 index 00000000..061493fa --- /dev/null +++ b/Eligibility Trace agents/Watkins Q/agent_watkins_q.py @@ -0,0 +1,303 @@ +import random +from typing import NamedTuple, Optional, Tuple, Union, List +import numpy as np +from numpy import ndarray +import logging +import boolean + +from cyberbattle._env import cyberbattle_env +from agent_wrapper import EnvironmentBounds, discovered_nodes_notowned +from gym import spaces, Wrapper + +import agent_wrapper as w +from learner import Learner + +from torch import Tensor +import torch + +class StateActionModel: + + def __init__(self, ep: EnvironmentBounds): + self.ep = ep 
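+        # The state is modelled as three feature groups: global environment
+        # features, features of the source (acting) node, and features of the
+        # optional target node; WatkinsQPolicy.get_actor_state_vector()
+        # concatenates whichever of them apply to a given action.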
+ + self.global_features = w.ConcatFeatures(ep, [ + w.Feature_discovered_not_owned_nodes_sliding(ep), + w.Feature_discovered_credential_count(ep) + ]) + + self.source_node_features = w.ConcatFeatures(ep, [ + w.Feature_active_node_properties(ep), + w.Feature_success_actions_at_node(ep) + ]) + + self.target_node_features = w.ConcatFeatures(ep, [ + w.Feature_active_node_id(ep) + ]) + + self.state_space = w.ConcatFeatures(ep, self.global_features.feature_selection + + self.source_node_features.feature_selection + + self.target_node_features.feature_selection) + + self.action_space = w.AbstractAction(ep) + + def valid_actions(self, wrapped_env: w.AgentWrapper, observation): + """returns a list of valid actions and the nodes they can be carried out from""" + + nodes_and_actions = [] + discovered_nodes = np.union1d(w.owned_nodes(observation), w.discovered_nodes_notowned(observation)) + + for from_node in w.owned_nodes(observation): + for local_action in range(self.action_space.n_local_actions): + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, local_action, None) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, local_action, -1)) + + for remote_action in range(self.action_space.n_local_actions, self.action_space.n_local_actions + self.action_space.n_remote_actions): + for target_node in discovered_nodes: + if target_node != from_node: + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, remote_action, target_node) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, remote_action, target_node)) + + for connect_action in range(self.action_space.n_local_actions + self.action_space.n_remote_actions, self.action_space.n_actions): + trial_action = self.action_space.abstract_to_gymaction(from_node, observation, connect_action, None) + if trial_action and wrapped_env.env.is_action_valid(trial_action, observation['action_mask']): + nodes_and_actions.append((from_node, connect_action, -1)) + + return nodes_and_actions + +class Memory: + + def __init__(self, ep:EnvironmentBounds, hash_size): + self.hash_size = hash_size + + self.memory = torch.zeros([2, hash_size], dtype=torch.float64) + + def state_action_index(self, state_space, abstract_action): + """Turns a state action pair into an index for the memory tensor""" + feature_vector = np.append(state_space, abstract_action) + hash_number = abs(hash(str(feature_vector))) + + return hash_number % self.hash_size + +class ChosenActionMetadata(NamedTuple): + + abstract_action: np.int32 + actor_node: int + actor_features: ndarray + actor_state: ndarray + + def __repr__(self) -> str: + return f"[abstract_action={self.abstract_action}, actor={self.actor_node}, state={self.actor_state}]" + +class WatkinsQPolicy(Learner): + + def __init__(self, + ep: EnvironmentBounds, + gamma: float, + λ: float, + learning_rate: float, + hash_size: int + ): + + self.model = StateActionModel(ep) + self.n_local_actions = ep.local_attacks_count + self.n_remote_actions = ep.remote_attacks_count + self.n_actions = self.n_local_actions + self.n_remote_actions + ep.port_count + self.gamma = gamma + self.λ = λ + self.learning_rate = learning_rate + self.hash_size = hash_size + + self.memory = Memory(ep, hash_size) + + def parameters_as_string(self): + return f'γ={self.gamma}, lr={self.learning_rate}, λ={self.λ},\n' + + def all_parameters_as_string(self) -> str: + 
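+        """Hyper-parameters plus the state/action-space dimensions and the selected state features."""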
model = self.model + return f'{self.parameters_as_string()}\n' \ + f'dimension={model.state_space.flat_size()}x{model.action_space.flat_size()}, ' \ + f'Q={[f.name() for f in model.state_space.feature_selection]} ' \ + f"-> 'abstract_action'" + + def get_actor_state_vector(self, global_state: ndarray, actor_features: ndarray, target_features: Optional[ndarray]) -> ndarray: + """Turns seperate state features into one vector""" + if target_features is None: + return np.concatenate((np.array(global_state, dtype=np.float32), + np.array(actor_features, dtype=np.float32))) + else: + return np.concatenate((np.array(global_state, dtype=np.float32), + np.array(actor_features, dtype=np.float32), + np.array(target_features, dtype=np.float32))) + + def update_memory(self, + reward: float, + actor_state: ndarray, + abstract_action: int, + next_actor_state: Optional[ndarray], + next_abstract_action: Optional[int], + chosen_action_is_max = boolean): + + current_state_action_index = self.memory.state_action_index(actor_state, abstract_action) + if next_actor_state is None: + δ = reward - self.memory.memory[0][current_state_action_index].item() + else: + next_state_action_index = self.memory.state_action_index(next_actor_state, next_abstract_action) + δ = reward + (self.gamma * self.memory.memory[0][next_state_action_index].item()) - self.memory.memory[0][current_state_action_index].item() + + self.memory.memory[1][current_state_action_index] += 1 + + non_zero_indicies_q = torch.argwhere(self.memory.memory[0]).numpy() + non_zero_indicies_e = torch.argwhere(self.memory.memory[1]).numpy() + non_zero_indicies = np.union1d(non_zero_indicies_q, non_zero_indicies_e) + + for i in non_zero_indicies: + + self.memory.memory[0][i] = self.memory.memory[0][i].item() + float(self.learning_rate * δ * self.memory.memory[1][i].item()) + self.memory.memory[0][i] = round(self.memory.memory[0][i].item(), 5) + self.memory.memory[0][i] = max(0, self.memory.memory[0][i].item()) + self.memory.memory[0][i] = min(100, self.memory.memory[0][i].item()) + + if chosen_action_is_max: + self.memory.memory[1][i] = self.memory.memory[1][i].item() * float(self.gamma * self.λ) + self.memory.memory[1][i] = round(self.memory.memory[0][i].item(), 5) + else: + self.memory.memory[1][i] = 0 + + def on_step(self, wrapped_env: w.AgentWrapper, + observation, reward: float, done: bool, action_metadata, epsilon): + + if done: + self.update_memory(reward, + actor_state=action_metadata.actor_state, + abstract_action=action_metadata.abstract_action, + next_actor_state=None, + next_abstract_action=None, + chosen_action_is_max=False + ) + else: + x = np.random.rand() + if x <= epsilon: + _, _, chosen_next_action_metadata = self.explore(wrapped_env) + else: + _, _, chosen_next_action_metadata = self.exploit(wrapped_env, observation) + + chosen_action_pair = ((list(chosen_next_action_metadata.actor_state), chosen_next_action_metadata.abstract_action)) + max_action_pairs = self.max_action_in_state(wrapped_env, observation) + + if chosen_action_pair in max_action_pairs: + next_action_pair = chosen_action_pair + chosen_action_is_max = True + else: + next_action_pair = random.choice(max_action_pairs) + chosen_action_is_max = False + + self.update_memory(reward, + actor_state=action_metadata.actor_state, + abstract_action=action_metadata.abstract_action, + next_actor_state=next_action_pair[0], + next_abstract_action=next_action_pair[1], + chosen_action_is_max=chosen_action_is_max + ) + + + def new_episode(self): + torch.mul(self.memory.memory[1], 0) + + def 
end_of_episode(self, i_episode, t): + return None + + def end_of_iteration(self, t, done): + return None + + def metadata_from_gymaction(self, wrapped_env, gym_action): + """Takes in a gym action and returns it's metadata""" + current_global_state = self.model.global_features.get(wrapped_env.state, node=None) + actor_node = cyberbattle_env.sourcenode_of_action(gym_action) + actor_features = self.model.source_node_features.get(wrapped_env.state, actor_node) + abstract_action = self.model.action_space.abstract_from_gymaction(gym_action) + + if 'remote_vulnerability' in gym_action: + target_node = self.model.target_node_features.get(wrapped_env.state, gym_action['remote_vulnerability'][1]) + else: + target_node = None + + return ChosenActionMetadata( + abstract_action=abstract_action, + actor_node=actor_node, + actor_features=actor_features, + actor_state=self.get_actor_state_vector(current_global_state, actor_features, target_node)) + + def stateaction_as_string(self, action_metadata) -> str: + return '' + + def explore(self, wrapped_env: w.AgentWrapper + ) -> Tuple[str, cyberbattle_env.Action, object]: + + gym_action = wrapped_env.env.sample_valid_action(kinds=[0, 1, 2]) + metadata = self.metadata_from_gymaction(wrapped_env, gym_action) + return gym_action, metadata + + def exploit(self, wrapped_env: w.AgentWrapper, observation) -> Tuple[str, Optional[cyberbattle_env.Action], object]: + + current_global_state = self.model.global_features.get(wrapped_env.state, node=None) + valid_nodes_and_actions = self.model.valid_actions(wrapped_env, observation) + + #The q_values are the estimated returns from an action taken in the current state + q_values = [] + for item in valid_nodes_and_actions: + source_node_features = self.model.source_node_features.get(wrapped_env.state, item[0]) + + if item[1] < self.n_local_actions or item[1] - self.n_local_actions > self.n_remote_actions: + actor_state_vector = self.get_actor_state_vector(current_global_state, source_node_features, None) + else: + target_node_features = self.model.target_node_features.get(wrapped_env.state, item[2]) + actor_state_vector = self.get_actor_state_vector(current_global_state, source_node_features, target_node_features) + + action_state_index = self.memory.state_action_index(actor_state_vector, item[1]) + + q_values.append(self.memory.memory[0][action_state_index].item()) + + indicies_of_chosen_actions = [i for i, x in enumerate(q_values) if x == max(q_values)] + chosen_action_index = random.choice(indicies_of_chosen_actions) + chosen_action = valid_nodes_and_actions[chosen_action_index] + + if chosen_action[1] < self.n_local_actions or chosen_action[1] - self.n_local_actions > self.n_remote_actions: + gym_action = self.model.action_space.abstract_to_gymaction(chosen_action[0], observation, chosen_action[1], None) + else: + gym_action = self.model.action_space.abstract_to_gymaction(chosen_action[0], observation, chosen_action[1], chosen_action[2]) + + metadata = self.metadata_from_gymaction(wrapped_env, gym_action) + + return gym_action, metadata + + def max_action_in_state(self, wrapped_env: w.AgentWrapper, observation): + current_global_state = self.model.global_features.get(wrapped_env.state, node=None) + valid_nodes_and_actions = self.model.valid_actions(wrapped_env, observation) + + q_values = [] + states_and_actions = [] + for item in valid_nodes_and_actions: + source_node_features = self.model.source_node_features.get(wrapped_env.state, item[0]) + + if item[1] < self.n_local_actions: + actor_state_vector = 
self.get_actor_state_vector(current_global_state, source_node_features, None) + elif item[1] - self.n_local_actions < self.n_remote_actions: + target_node_features = self.model.target_node_features.get(wrapped_env.state, item[2]) + actor_state_vector = self.get_actor_state_vector(current_global_state, source_node_features, target_node_features) + else: + actor_state_vector = self.get_actor_state_vector(current_global_state, source_node_features, None) + + action_state_index = self.memory.state_action_index(actor_state_vector, item[1]) + + q_values.append(self.memory.memory[0][action_state_index].item()) + states_and_actions.append((list(actor_state_vector), item[1])) + + indicies_of_chosen_actions = [i for i, x in enumerate(q_values) if x == max(q_values)] + to_return = [] + for i in range(len(states_and_actions)): + if i in indicies_of_chosen_actions: + to_return.append(states_and_actions[i]) + + return to_return \ No newline at end of file diff --git a/Eligibility Trace agents/Watkins Q/agent_wrapper.py b/Eligibility Trace agents/Watkins Q/agent_wrapper.py new file mode 100644 index 00000000..985aa1e9 --- /dev/null +++ b/Eligibility Trace agents/Watkins Q/agent_wrapper.py @@ -0,0 +1,342 @@ +from cyberbattle._env.cyberbattle_env import EnvironmentBounds +from typing import Optional, List +import enum +import numpy as np +from gym import spaces, Wrapper +from numpy import ndarray +import cyberbattle._env.cyberbattle_env as cyberbattle_env +import logging + + +class StateAugmentation: + """Default agent state augmentation, consisting of the gym environment + observation itself and nothing more.""" + + def __init__(self, observation: cyberbattle_env.Observation): + self.observation = observation + + def on_step(self, action: cyberbattle_env.Action, reward: float, done: bool, observation: cyberbattle_env.Observation): + self.observation = observation + + def on_reset(self, observation: cyberbattle_env.Observation): + self.observation = observation + + +class Feature(spaces.MultiDiscrete): + """ + Feature consisting of multiple discrete dimensions. + Parameters: + nvec: is a vector defining the number of possible values + for each discrete space. 
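+
+    Individual features are typically combined with ConcatFeatures to build the
+    flat state vector, e.g. (a sketch; `state_augmentation` stands for any
+    StateAugmentation instance):
+
+        global_features = ConcatFeatures(p, [Feature_discovered_credential_count(p)])
+        vector = global_features.get(state_augmentation, node=None)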
+ """ + + def __init__(self, env_properties: EnvironmentBounds, nvec): + self.env_properties = env_properties + super().__init__(nvec) + + def flat_size(self): + return np.prod(self.nvec) + + def name(self): + """Return the name of the feature""" + p = len(type(Feature(self.env_properties, [])).__name__) + 1 + return type(self).__name__[p:] + + def get(self, a: StateAugmentation, node: Optional[int]) -> np.ndarray: + """Compute the current value of a feature value at + the current observation and specific node""" + raise NotImplementedError + + def pretty_print(self, v): + return v + +class Feature_active_node_properties(Feature): + """Bitmask of all properties set for the active node""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [2] * p.property_count) + + def get(self, a: StateAugmentation, node) -> ndarray: + assert node is not None, 'feature only valid in the context of a node' + + node_prop = a.observation['discovered_nodes_properties'] + + # list of all properties set/unset on the node + # Remap to get rid of unknown value 0: 1 -> 1, and -1 -> 0 (and 0-> 0) + assert node < len(node_prop), f'invalid node index {node} (not discovered yet)' + remapped = np.array((1 + node_prop[node]) / 2, dtype=int) + return remapped + + +class Feature_active_node_id(Feature): + """Return the node id itself""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count] * 1) + + def get(self, a: StateAugmentation, node) -> ndarray: + return np.array([node], dtype=int) + + +class Feature_discovered_credential_count(Feature): + """number of credentials discovered so far""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_total_credentials + 1]) + + def get(self, a: StateAugmentation, node): + return [len(a.observation['credential_cache_matrix'])] + + +class Feature_discovered_not_owned_nodes_sliding(Feature): + """array of which of discovered nodes not owned by name""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count + 1]) + + def get(self, a: StateAugmentation, node): + discovered = a.observation['discovered_nodes'] + levels = a.observation['nodes_privilegelevel'] + owned_nodes_indices = np.where(levels > 0)[0] + owned = [] + for i in owned_nodes_indices: + owned.append(discovered[i]) + discovered_not_owned = [] + for node in discovered: + if node not in owned: + discovered_not_owned.append(node) + discovered_not_owned_sliding = np.zeros(self.env_properties.maximum_node_count, np.int32) + for node_id in discovered_not_owned: + if node_id == 'client': + discovered_not_owned_sliding[0] = 1 + elif node_id == 'Website': + discovered_not_owned_sliding[1] = 1 + elif node_id == 'Website.Directory': + discovered_not_owned_sliding[2] = 1 + elif node_id == 'Website[user=monitor]': + discovered_not_owned_sliding[3] = 1 + elif node_id == 'GitHubProject': + discovered_not_owned_sliding[4] = 1 + elif node_id == 'AzureStorage': + discovered_not_owned_sliding[5] = 1 + elif node_id == 'Sharepoint': + discovered_not_owned_sliding[6] = 1 + elif node_id == 'AzureResourceManager': + discovered_not_owned_sliding[7] = 1 + elif node_id == 'AzureResourceManager[user-monitor]': + discovered_not_owned_sliding[8] = 1 + elif node_id == 'AzureVM': + discovered_not_owned_sliding[9] = 1 + return discovered_not_owned_sliding + +class Feature_active_node_id(Feature): + """number asigned to each type of node in toy-ctf""" + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [p.maximum_node_count + 
1]) + + def get(self, a: StateAugmentation, node): + node_id = a.observation['discovered_nodes'][node] + + node_id_array = np.zeros(1, np.int32) + if node_id == 'client': + node_id_array[0] = 0 + elif node_id == 'Website': + node_id_array[0] = 1 + elif node_id == 'Website.Directory': + node_id_array[0] = 2 + elif node_id == 'Website[user=monitor]': + node_id_array[0] = 3 + elif node_id == 'GitHubProject': + node_id_array[0] = 4 + elif node_id == 'AzureStorage': + node_id_array[0] = 5 + elif node_id == 'Sharepoint': + node_id_array[0] = 6 + elif node_id == 'AzureResourceManager': + node_id_array[0] = 7 + elif node_id == 'AzureResourceManager[user-monitor]': + node_id_array[0] = 8 + elif node_id == 'AzureVM': + node_id_array[0] = 9 + else: + node_id_array[0] = 10 + return node_id_array + + +class ConcatFeatures(Feature): + """ Concatenate a list of features into a single feature + Parameters: + feature_selection - a selection of features to combine + """ + + def __init__(self, p: EnvironmentBounds, feature_selection: List[Feature]): + self.feature_selection = feature_selection + self.dim_sizes = np.concatenate([f.nvec for f in feature_selection]) + super().__init__(p, [self.dim_sizes]) + + def pretty_print(self, v): + return v + + def get(self, a: StateAugmentation, node=None) -> np.ndarray: + """Return the feature vector""" + feature_vector = [f.get(a, node) for f in self.feature_selection] + return np.concatenate(feature_vector) + + +def owned_nodes(observation): + """Return the list of owned nodes""" + return np.nonzero(observation['nodes_privilegelevel'])[0] + + +def discovered_nodes_notowned(observation): + """Return the list of discovered nodes that are not owned yet""" + return np.nonzero(observation['nodes_privilegelevel'] == 0)[0] + + +class AbstractAction(Feature): + """An abstraction of the gym state space that reduces + the space dimension for learning use to just + - local_attack(vulnid) (source_node provided) + - remote_attack(vulnid) (source_node provided, target_node forgotten) + - connect(port) (source_node provided, target_node forgotten, credentials infered from cache) + """ + + def __init__(self, p: EnvironmentBounds): + self.n_local_actions = p.local_attacks_count + self.n_remote_actions = p.remote_attacks_count + self.n_connect_actions = p.port_count + self.n_actions = self.n_local_actions + self.n_remote_actions + self.n_connect_actions + super().__init__(p, [self.n_actions]) + + def abstract_to_gymaction(self, source_node, observation, abstract_action, target_node): + """Takes a statring node and an abstract action number and returns a gym action""" + + if abstract_action < self.n_local_actions: + vuln = abstract_action + return {'local_vulnerability': np.array([source_node, vuln])} + + node_prop = observation['discovered_nodes_properties'] + abstract_action -= self.n_local_actions + if abstract_action < self.n_remote_actions: + vuln = abstract_action + + discovered_nodes_count = len(node_prop) + if discovered_nodes_count <= 1: + return None + + return {'remote_vulnerability': np.array([source_node, target_node, vuln])} + + abstract_action -= self.n_remote_actions + port = np.int32(abstract_action) + + discovered_credentials = np.array(observation['credential_cache_matrix']) + n_discovered_creds = len(discovered_credentials) + if n_discovered_creds <= 0: + return None + + nodes_not_owned = discovered_nodes_notowned(observation) + match_port = discovered_credentials[:, 1] == port + match_port_indicies = np.where(match_port)[0] + + credential_indices_choices = [c for c 
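+                                       # keep only cached credentials whose target node is not owned yet;
+                                       # the fallback just below reuses any credential pointing at a
+                                       # not-yet-owned node when none matches the requested port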
in match_port_indicies + if discovered_credentials[c, 0] in nodes_not_owned] + + if credential_indices_choices: + logging.debug('found matching cred in the credential cache') + else: + logging.debug('no cred matching requested port, trying instead creds used to access other ports') + credential_indices_choices = [i for (i, n) in enumerate(discovered_credentials[:, 0]) + if n in nodes_not_owned] + + if credential_indices_choices: + logging.debug('found cred in the credential cache without matching port name') + else: + logging.debug('no cred to use from the credential cache') + return None + + cred = np.int32(np.random.choice(credential_indices_choices)) + target = np.int32(discovered_credentials[cred, 0]) + return {'connect': np.array([source_node, target, port, cred], dtype=np.int32)} + + def abstract_from_gymaction(self, gym_action: cyberbattle_env.Action) -> np.int32: + """Turns a gym action into it's abstract action number""" + if 'local_vulnerability' in gym_action: + return gym_action['local_vulnerability'][1] + elif 'remote_vulnerability' in gym_action: + r = gym_action['remote_vulnerability'] + return self.n_local_actions + r[2] + + assert 'connect' in gym_action + c = gym_action['connect'] + + a = self.n_local_actions + self.n_remote_actions + c[2] + assert a < self.n_actions + return np.int32(a) + + +class ActionTrackingStateAugmentation(StateAugmentation): + """An agent state augmentation consisting of + the environment observation augmented with the following dynamic information: + - success_action_count: count of action taken and succeeded at the current node + - failed_action_count: count of action taken and failed at the current node + """ + + def __init__(self, p: EnvironmentBounds, observation: cyberbattle_env.Observation): + self.aa = AbstractAction(p) + self.success_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.failed_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.env_properties = p + super().__init__(observation) + + def on_step(self, action: cyberbattle_env.Action, reward: float, done: bool, observation: cyberbattle_env.Observation): + node = cyberbattle_env.sourcenode_of_action(action) + abstract_action = self.aa.abstract_from_gymaction(action) + if reward > 0: + self.success_action_count[node, abstract_action] += 1 + else: + self.failed_action_count[node, abstract_action] += 1 + super().on_step(action, reward, done, observation) + + def on_reset(self, observation: cyberbattle_env.Observation): + p = self.env_properties + self.success_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + self.failed_action_count = np.zeros(shape=(p.maximum_node_count, self.aa.n_actions), dtype=np.int32) + super().on_reset(observation) + + +class Feature_success_actions_at_node(Feature): + """number of time each action succeeded at a given node""" + + max_action_count = 100 + + def __init__(self, p: EnvironmentBounds): + super().__init__(p, [self.max_action_count] * AbstractAction(p).n_actions) + + def get(self, a: ActionTrackingStateAugmentation, node: int): + return np.minimum(a.success_action_count[node, :], self.max_action_count - 1) + + +class Verbosity(enum.Enum): + """Verbosity of the learning function""" + Quiet = 0 + Normal = 1 + Verbose = 2 + + +class AgentWrapper(Wrapper): + """Gym wrapper to update the agent state on every step""" + + def __init__(self, env: cyberbattle_env.CyberBattleEnv, state: StateAugmentation): + super().__init__(env) 
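+        # keep a handle on the state augmentation so that step() and reset()
+        # below can refresh it with every new observation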
+ self.state = state + + def step(self, action: cyberbattle_env.Action): + observation, reward, done, info = self.env.step(action) + self.state.on_step(action, reward, done, observation) + return observation, reward, done, info + + def reset(self): + observation = self.env.reset() + self.state.on_reset(observation) + return observation \ No newline at end of file diff --git a/Eligibility Trace agents/Watkins Q/learner.py b/Eligibility Trace agents/Watkins Q/learner.py new file mode 100644 index 00000000..afaefd1f --- /dev/null +++ b/Eligibility Trace agents/Watkins Q/learner.py @@ -0,0 +1,270 @@ +import math +import sys + +from plotting import PlotTraining, plot_averaged_cummulative_rewards +from agent_wrapper import AgentWrapper, EnvironmentBounds, Verbosity, ActionTrackingStateAugmentation +import logging +import numpy as np +from cyberbattle._env import cyberbattle_env +from typing import Tuple, Optional, TypedDict, List +import progressbar +import abc + +class Agent(abc.ABC): + + @abc.abstractmethod + def explore(self, wrapped_env: AgentWrapper) -> Tuple[cyberbattle_env.Action, object]: + """Exploration function. + Returns (action_type, gym_action, action_metadata) where + action_metadata is a custom object that gets passed to the on_step callback function""" + raise NotImplementedError + + @abc.abstractmethod + def exploit(self, wrapped_env: AgentWrapper, observation) -> Tuple[Optional[cyberbattle_env.Action], object]: + """Exploit function. + Returns (action_type, gym_action, action_metadata) where + action_metadata is a custom object that gets passed to the on_step callback function""" + raise NotImplementedError + + @abc.abstractmethod + def on_step(self, wrapped_env: AgentWrapper, observation, reward, done, action_metadata, epsilon) -> None: + raise NotImplementedError + + def parameters_as_string(self) -> str: + return '' + + def all_parameters_as_string(self) -> str: + return '' + + def loss_as_string(self) -> str: + return '' + + def stateaction_as_string(self, action_metadata) -> str: + return '' + +Breakdown = TypedDict('Breakdown', { + 'local': int, + 'remote': int, + 'connect': int +}) + +Outcomes = TypedDict('Outcomes', { + 'reward': Breakdown, + 'noreward': Breakdown +}) + +Stats = TypedDict('Stats', { + 'exploit': Outcomes, + 'explore': Outcomes +}) + +TrainedAgent = TypedDict('TrainedAgent', { + 'all_episodes_rewards': List[List[float]], + 'all_episodes_availability': List[List[float]], + 'agent': Agent, + 'trained_on': str, + 'title': str +}) + +def print_stats(stats): + """Print learning statistics""" + def print_breakdown(stats, actiontype: str): + def ratio(kind: str) -> str: + x, y = stats[actiontype]['reward'][kind], stats[actiontype]['noreward'][kind] + sum = x + y + if sum == 0: + return 'NaN' + else: + return f"{(x / sum):.2f}" + + def print_kind(kind: str): + print( + f" {actiontype}-{kind}: {stats[actiontype]['reward'][kind]}/{stats[actiontype]['noreward'][kind]} " + f"({ratio(kind)})") + print_kind('local') + print_kind('remote') + print_kind('connect') + + print(" Breakdown [Reward/NoReward (Success rate)]") + print_breakdown(stats, 'explore') + print_breakdown(stats, 'exploit') + +def epsilon_greedy_search( + cyberbattle_gym_env: cyberbattle_env.CyberBattleEnv, + environment_properties: EnvironmentBounds, + agent: Agent, + title: str, + episode_count: int, + iteration_count: int, + epsilon: float, + epsilon_minimum=0.0, + epsilon_multdecay: Optional[float] = None, + epsilon_exponential_decay: Optional[int] = None, + render=True, + 
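+        # Two optional ϵ-decay schedules are implemented below: epsilon_multdecay
+        # multiplies ϵ once per episode (floored at epsilon_minimum), while
+        # epsilon_exponential_decay recomputes, at every step,
+        # ϵ = ϵ_min + (ϵ_0 − ϵ_min)·exp(−steps_done/ϵ_expdecay).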
render_last_episode_rewards_to: Optional[str] = None, + verbosity: Verbosity = Verbosity.Normal, + plot_episodes_length=True +) -> TrainedAgent: + + print(f"###### {title}\n" + f"Learning with: episode_count={episode_count}," + f"iteration_count={iteration_count}," + f"ϵ={epsilon}," + f'ϵ_min={epsilon_minimum}, ' + + (f"ϵ_multdecay={epsilon_multdecay}," if epsilon_multdecay else '') + + (f"ϵ_expdecay={epsilon_exponential_decay}," if epsilon_exponential_decay else '') + + f"{agent.parameters_as_string()}") + + initial_epsilon = epsilon + + all_episodes_rewards = [] + all_episodes_availability = [] + + wrapped_env = AgentWrapper(cyberbattle_gym_env, + ActionTrackingStateAugmentation(environment_properties, cyberbattle_gym_env.reset())) + + steps_done = 0 + plot_title = f"{title} (epochs={episode_count}, ϵ={initial_epsilon}, ϵ_min={epsilon_minimum}," \ + + (f"ϵ_multdecay={epsilon_multdecay}," if epsilon_multdecay else '') \ + + (f"ϵ_expdecay={epsilon_exponential_decay}," if epsilon_exponential_decay else '') \ + + agent.parameters_as_string() + plottraining = PlotTraining(title=plot_title, render_each_episode=render) + + render_file_index = 1 + + for i_episode in range(1, episode_count + 1): + + print(f" ## Episode: {i_episode}/{episode_count} '{title}' " + f"ϵ={epsilon:.4f}, " + f"{agent.parameters_as_string()}") + + observation = wrapped_env.reset() + total_reward = 0.0 + all_rewards = [] + all_availability = [] + agent.new_episode() + + stats = Stats(exploit=Outcomes(reward=Breakdown(local=0, remote=0, connect=0), + noreward=Breakdown(local=0, remote=0, connect=0)), + explore=Outcomes(reward=Breakdown(local=0, remote=0, connect=0), + noreward=Breakdown(local=0, remote=0, connect=0)) + ) + + episode_ended_at = None + sys.stdout.flush() + + bar = progressbar.ProgressBar( + widgets=[ + 'Episode ', + f'{i_episode}', + '|Iteration ', + progressbar.Counter(), + '|', + progressbar.Variable(name='reward', width=6, precision=10), + '|', + progressbar.Variable(name='last_reward_at', width=4), + '|', + progressbar.Timer(), + progressbar.Bar() + ], + redirect_stdout=False) + + for t in bar(range(1, 1 + iteration_count)): + + if epsilon_exponential_decay: + epsilon = epsilon_minimum + math.exp(-1. 
* steps_done / + epsilon_exponential_decay) * (initial_epsilon - epsilon_minimum) + + steps_done += 1 + + x = np.random.rand() + if x <= epsilon: + gym_action, action_metadata = agent.explore(wrapped_env) + action_style = "explore" + else: + gym_action, action_metadata = agent.exploit(wrapped_env, observation) + action_style = "exploit" + + # Take the step + logging.debug(f"gym_action={gym_action}, action_metadata={action_metadata}") + observation, reward, done, info = wrapped_env.step(gym_action) + + action_type = 'exploit' if action_style == 'exploit' else 'explore' + outcome = 'reward' if reward > 0 else 'noreward' + if 'local_vulnerability' in gym_action: + stats[action_type][outcome]['local'] += 1 + elif 'remote_vulnerability' in gym_action: + stats[action_type][outcome]['remote'] += 1 + else: + stats[action_type][outcome]['connect'] += 1 + + agent.on_step(wrapped_env, observation, reward, done, action_metadata, epsilon) + assert np.shape(reward) == () + + all_rewards.append(reward) + all_availability.append(info['network_availability']) + total_reward += reward + bar.update(t, reward=total_reward) + if reward > 0: + bar.update(t, last_reward_at=t) + + if verbosity == Verbosity.Verbose or (verbosity == Verbosity.Normal and reward > 0): + sign = ['-', '+'][reward > 0] + + print(f" {sign} t={t} {action_style} r={reward} cum_reward:{total_reward} " + f"a={action_metadata}-{gym_action} " + f"creds={len(observation['credential_cache_matrix'])} " + f" {agent.stateaction_as_string(action_metadata)}") + + if i_episode == episode_count \ + and render_last_episode_rewards_to is not None \ + and reward > 0: + fig = cyberbattle_gym_env.render_as_fig() + fig.write_image(f"{render_last_episode_rewards_to}-e{i_episode}-{render_file_index}.png") + render_file_index += 1 + + agent.end_of_iteration(t, done) + + if done: + episode_ended_at = t + bar.finish(dirty=True) + break + + sys.stdout.flush() + + loss_string = agent.loss_as_string() + if loss_string: + loss_string = "loss={loss_string}" + + if episode_ended_at: + print(f" Episode {i_episode} ended at t={episode_ended_at} {loss_string}") + else: + print(f" Episode {i_episode} stopped at t={iteration_count} {loss_string}") + + print_stats(stats) + + all_episodes_rewards.append(all_rewards) + all_episodes_availability.append(all_availability) + + length = episode_ended_at if episode_ended_at else iteration_count + agent.end_of_episode(i_episode=i_episode, t=length) + if plot_episodes_length: + plottraining.episode_done(length) + if render: + wrapped_env.render() + + if epsilon_multdecay: + epsilon = max(epsilon_minimum, epsilon * epsilon_multdecay) + + wrapped_env.close() + print("simulation ended") + if plot_episodes_length: + plottraining.plot_end() + + return TrainedAgent( + all_episodes_rewards=all_episodes_rewards, + all_episodes_availability=all_episodes_availability, + agent=agent, + trained_on=cyberbattle_gym_env.name, + title=plot_title + ) \ No newline at end of file diff --git a/Eligibility Trace agents/Watkins Q/plotting.py b/Eligibility Trace agents/Watkins Q/plotting.py new file mode 100644 index 00000000..e51b4a55 --- /dev/null +++ b/Eligibility Trace agents/Watkins Q/plotting.py @@ -0,0 +1,203 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
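+ # These helpers consume the TrainedAgent dictionaries returned by
+ # learner.epsilon_greedy_search; each run is expected to carry the keys
+ # 'all_episodes_rewards', 'all_episodes_availability' and 'title'.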
+ +"""Plotting helpers for agent banchmarking""" + +import matplotlib.pyplot as plt # type:ignore +import numpy as np + + +def new_plot(title): + """Prepare a new plot of cumulative rewards""" + plt.figure(figsize=(10, 8)) + plt.ylabel('cumulative reward', fontsize=20) + plt.xlabel('step', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title(title, fontsize=12) + + +def pad(array, length): + """Pad an array with 0s to make it of desired length""" + padding = np.zeros((length,)) + padding[:len(array)] = array + return padding + + +def plot_episodes_rewards_averaged(results): + """Plot cumulative rewards for a given set of specified episodes""" + max_iteration_count = np.max([len(r) for r in results['all_episodes_rewards']]) + + all_episodes_rewards_padded = [pad(rewards, max_iteration_count) for rewards in results['all_episodes_rewards']] + cumrewards = np.cumsum(all_episodes_rewards_padded, axis=1) + avg = np.average(cumrewards, axis=0) + std = np.std(cumrewards, axis=0) + x = [i for i in range(len(std))] + plt.plot(x, avg, label=results['title']) + plt.fill_between(x, avg - std, avg + std, alpha=0.5) + + +def fill_with_latest_value(array, length): + pad = length - len(array) + if pad > 0: + return np.pad(array, (0, pad), mode='edge') + else: + return array + + +def plot_episodes_availability_averaged(results): + """Plot availability for a given set of specified episodes""" + data = results['all_episodes_availability'] + longest_episode_length = np.max([len(r) for r in data]) + + all_episodes_padded = [fill_with_latest_value(av, longest_episode_length) for av in data] + avg = np.average(all_episodes_padded, axis=0) + std = np.std(all_episodes_padded, axis=0) + x = [i for i in range(len(std))] + plt.plot(x, avg, label=results['title']) + plt.fill_between(x, avg - std, avg + std, alpha=0.5) + + +def plot_episodes_length(learning_results): + """Plot length of every episode""" + plt.figure(figsize=(10, 8)) + plt.ylabel('#iterations', fontsize=20) + plt.xlabel('episode', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title("Length of each episode", fontsize=12) + + for results in learning_results: + iterations = [len(e) for e in results['all_episodes_rewards']] + episode = [i for i in range(len(results['all_episodes_rewards']))] + plt.plot(episode, iterations, label=f"{results['title']}") + + plt.legend(loc="upper right") + plt.show() + + +def plot_each_episode(results): + """Plot cumulative rewards for each episode""" + for i, episode in enumerate(results['all_episodes_rewards']): + cumrewards = np.cumsum(episode) + x = [i for i in range(len(cumrewards))] + plt.plot(x, cumrewards, label=f'Episode {i}') + + +def plot_all_episodes(r): + """Plot cumulative rewards for every episode""" + new_plot(r['title']) + plot_each_episode(r) + plt.legend(loc="lower right") + plt.show() + + +def plot_averaged_cummulative_rewards(title, all_runs): + """Plot averaged cumulative rewards""" + new_plot(title) + for r in all_runs: + plot_episodes_rewards_averaged(r) + plt.legend(loc="lower right") + plt.show() + + +def plot_averaged_availability(title, all_runs): + """Plot averaged network availability""" + plt.figure(figsize=(10, 8)) + plt.ylabel('network availability', fontsize=20) + plt.xlabel('step', fontsize=20) + plt.xticks(size=20) + plt.yticks(size=20) + plt.title(title, fontsize=12) + for r in all_runs: + plot_episodes_availability_averaged(r) + plt.legend(loc="lower right") + plt.show() + + +def new_plot_loss(): + """Plot MSE loss averaged over all episodes""" + 
plt.figure(figsize=(10, 8)) + plt.ylabel('loss', fontsize=20) + plt.xlabel('episodes', fontsize=20) + plt.xticks(size=12) + plt.yticks(size=20) + plt.title("Loss", fontsize=12) + + +def plot_all_episodes_loss(all_episodes_losses, name, label): + """Plot loss for one learning episode""" + x = [i for i in range(len(all_episodes_losses))] + plt.plot(x, all_episodes_losses, label=f'{name} {label}') + + +def running_mean(x, size): + """return moving average of x for a window of lenght 'size'""" + cumsum = np.cumsum(np.insert(x, 0, 0)) + return (cumsum[size:] - cumsum[:-size]) / float(size) + + +class PlotTraining: + """Plot training-related stats""" + + def __init__(self, title, render_each_episode): + self.episode_durations = [] + self.title = title + self.render_each_episode = render_each_episode + + def plot_durations(self, average_window=5): + # plt.figure(2) + plt.figure() + # plt.clf() + durations_t = np.array(self.episode_durations, dtype=np.float32) + plt.title('Training...') + plt.xlabel('Episode') + plt.ylabel('Duration') + plt.title(self.title, fontsize=12) + + episodes = [i + 1 for i in range(len(self.episode_durations))] + plt.plot(episodes, durations_t) + # plot episode running averages + if len(durations_t) >= average_window: + means = running_mean(durations_t, average_window) + means = np.concatenate((np.zeros(average_window - 1), means)) + plt.plot(episodes, means) + + # display.display(plt.gcf()) + plt.show() + + def episode_done(self, length): + self.episode_durations.append(length) + if self.render_each_episode: + self.plot_durations() + + def plot_end(self): + self.plot_durations() + plt.ioff() # type: ignore + # plt.show() + + +def length_of_all_episodes(run): + """Get the length of every episode""" + return [len(e) for e in run['all_episodes_rewards']] + + +def reduce(x, desired_width): + return [np.average(c) for c in np.array_split(x, desired_width)] + + +def episodes_rewards_averaged(run): + """Plot cumulative rewards for a given set of specified episodes""" + max_iteration_count = np.max([len(r) for r in run['all_episodes_rewards']]) + all_episodes_rewards_padded = [pad(rewards, max_iteration_count) for rewards in run['all_episodes_rewards']] + cumrewards = np.cumsum(all_episodes_rewards_padded, axis=1) + avg = np.average(cumrewards, axis=0) + return list(avg) + + +def episodes_lengths_for_all_runs(all_runs): + return [length_of_all_episodes(run) for run in all_runs] + + +def averaged_cummulative_rewards(all_runs, width): + return [reduce(episodes_rewards_averaged(run), width) for run in all_runs]
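+ 
+ # Minimal end-to-end sketch of how these helpers combine with
+ # learner.epsilon_greedy_search. Illustrative only: 'CyberBattleToyCtf-v0' and
+ # WatkinsQPolicy are assumed names (any registered CyberBattle gym id and any
+ # concrete Agent implementation will do), and the EnvironmentBounds arguments
+ # are left elided.
+ #
+ #   import gym
+ #   import learner, plotting
+ #   from agent_wrapper import EnvironmentBounds
+ #
+ #   env = gym.make('CyberBattleToyCtf-v0')
+ #   ep = EnvironmentBounds.of_identifiers(...)   # assuming CyberBattleSim's usual helper
+ #   agent = WatkinsQPolicy(...)                  # hypothetical Agent subclass
+ #
+ #   run = learner.epsilon_greedy_search(
+ #       env, ep, agent, title='Watkins Q(λ)',
+ #       episode_count=20, iteration_count=2000,
+ #       epsilon=0.9, epsilon_minimum=0.1, epsilon_exponential_decay=5000)
+ #
+ #   plotting.plot_averaged_cummulative_rewards('cumulative reward', [run])
+ #   plotting.plot_averaged_availability('network availability', [run])
+ #   plotting.plot_episodes_length([run])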