|
| 1 | +""" |
| 2 | +File that contains data frame helpers for the Lacework notebook environment. |
| 3 | +""" |
| 4 | + |
| 5 | +import json |
| 6 | +import logging |
| 7 | + |
| 8 | +import pandas as pd |
| 9 | +import numpy as np |
| 10 | + |
| 11 | +from laceworkjupyter import manager |
| 12 | + |
| 13 | + |
# Module-level logger used by the data frame helper features in this file.
logger = logging.getLogger("lacework_sdk.jupyter.feature.helper")
| 15 | + |
| 16 | + |
| 17 | +@manager.register_feature |
def deep_extract_field(data_frame, column, field_string, ctx=None):
    """
    Extract a field from a JSON struct inside a DataFrame.

    Usage example:
        df['hostname'] = lw.deep_extract_field(
            df, 'properties', 'host.hostname')

    Each cell in the column may hold either a dict or a JSON encoded string.
    For every row the dot delimited path is followed one key at a time:

    * A string that is not valid JSON, a cell that is not a dict, or a
      point on the path where the current value is not a dict logs an
      error and yields NaN for that row.
    * A key that is missing at the last step of the path yields None,
      since the lookup is done using dict.get.

    :param DataFrame data_frame: The data frame to extract from.
    :param str column: The name of the column that contains the JSON struct.
    :param str field_string: String that contains the field to extract from,
        this is a dot delimited string, eg: key.foo.bar, that will extract
        'value' from {'key': {'foo': {'bar': 'value'}}}.
    :param obj ctx: The context object, not referenced by this function.
    :return: A pandas Series with the extracted value for each row, or an
        empty Series if the column does not exist in the data frame.
    """
    if column not in data_frame:
        logger.error("Column does not exist in the dataframe.")
        # An explicit dtype avoids the version dependent default (and the
        # deprecation warning) of constructing an empty Series without one.
        return pd.Series(dtype=object)

    # The path is the same for every row, so split it once up front instead
    # of once per cell.
    path = field_string.split(".")

    def _extract_function(json_obj):
        # Cells may hold serialized JSON, decode those before walking them.
        if isinstance(json_obj, str):
            try:
                json_obj = json.loads(json_obj)
            except json.JSONDecodeError:
                logger.error("Unable to decode JSON string: %s", json_obj)
                return np.nan

        if not isinstance(json_obj, dict):
            logger.error("Unable to extract, not a dict: %s", type(json_obj))
            return np.nan

        data = json_obj
        for point in path:
            # A previous step may have produced a scalar, a list or None
            # (missing key), none of which can be descended into.
            if not isinstance(data, dict):
                logger.error(
                    "Sub-item %s is not a dict (%s)", point, type(data))
                return np.nan

            data = data.get(point)
        return data

    return data_frame[column].apply(_extract_function)
0 commit comments