
Transforms

enrichsdk.contrib.transforms

Standard transforms that can be directly included in any pipeline.

FileOperations(*args, **kwargs)

Bases: FileOperationsBase

FileOperations performs a number of operations on files generated by pipelines.

The transform takes a list of actions. The only action type supported for now is copy. Each copy task requires a source, a destination, and an instruction on what to do with any existing file at the destination.

Example::

{
        "transform": "FileOperations",
        "enable": true,
        "dependencies": {
           ....
        },
        "args": {
            "actions": [
                  {
                    "action": "copy",
                    "src": "%(output)s/%(runid)s/profile.sqlite",
                    "dst": "%(data_root)s/shared/campaigns/profile_daily/profile.sqlite",
                    "backupsuffix": ".backup"
                  }
             ]
        }
}
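
For orientation, here is a minimal standalone sketch of what a single copy action amounts to: expand the %-style placeholders, back up any existing destination using backupsuffix, then copy. The apply_copy_action helper and the context dictionary are illustrative assumptions, not part of the SDK::

import os
import shutil

def apply_copy_action(action, context):
    """Sketch of one 'copy' action: expand templates, back up, copy."""
    src = action["src"] % context                 # e.g. fills %(output)s, %(runid)s
    dst = action["dst"] % context
    backupsuffix = action.get("backupsuffix", ".backup")

    # Keep a backup of the destination if it already exists
    if os.path.exists(dst):
        shutil.copy2(dst, dst + backupsuffix)

    os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
    shutil.copy2(src, dst)
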
Source code in enrichsdk/contrib/transforms/fileops/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "FileOperations"
    self.description = "File operations such as move/copy etc."

    test_root = os.environ["ENRICH_TEST"]
    self.testdata = {
        "data_root": os.path.join(test_root, self.name),
        "outputdir": os.path.join(test_root, self.name, "output"),
        "inputdir": test_root,
        "statedir": os.path.join(test_root, self.name, "state"),
        "global": {"args": {"rundate": "2020-01-10"}},
        "conf": {
            "args": {
                "actions": [
                    {
                        "action": "copy",
                        "src": "%(output)s/%(runid)s/outputs/cars.csv",
                        "dst": "%(data_root)s/shared/%(rundate)s/hello.csv",
                    }
                ]
            }
        },
        "data": {},
    }

instantiable() classmethod

Make this class instantiable

Source code in enrichsdk/contrib/transforms/fileops/__init__.py
@classmethod
def instantiable(cls):
    """
    Make this class instantiable
    """
    return True

JSONSink(*args, **kwargs)

Bases: Sink

Store a 'dict' frame that is present in the state into a file.

Params are meant to be passed as parameters to update_frame.

Example configuration::

 "args": {
     "sink": {
         'test': {
             'frametype': 'dict',
             'filename': '%(output)s/%(runid)s/mytestoutput.json',
             'params': {}
         }
     }
 }
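
Outside the pipeline, the effect of a sink entry is essentially a json.dump of the matching 'dict' frame. The helper below is a rough sketch under that assumption; sink keys are treated as anchored regular expressions over frame names, mirroring the transform's process method, and the frames and context arguments are illustrative stand-ins for pipeline state::

import json
import os
import re

def write_json_sinks(sink, frames, context):
    """Sketch: write every dict frame whose name matches a sink pattern."""
    for pattern, detail in sink.items():
        regex = re.compile("^{}$".format(pattern))
        for name, df in frames.items():
            if not regex.search(name) or not isinstance(df, dict):
                continue
            filename = detail["filename"] % context      # expands %(output)s etc.
            os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
            with open(filename, "w") as fd:
                json.dump(df, fd, **detail.get("params", {}))
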
Source code in enrichsdk/contrib/transforms/jsonsink/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "JSONSink"

    self.testdata = {
        "conf": {
            "args": {
                "sink": {
                    "frame1": {
                        "frametype": "dict",
                        "filename": "%(output)s/%(runid)s/mytestoutput.json",
                        "params": {},
                    }
                }
            }
        },
        "data": {
            "frame1": {
                "filename": "outputjson.json",
                "frametype": "dict",
                "transform": "TestJSON",
                "params": {},
            }
        },
    }

preload_clean_args(args)

Clean when the spec is loaded...

Source code in enrichsdk/contrib/transforms/jsonsink/__init__.py
def preload_clean_args(self, args):
    """
    Clean when the spec is loaded...
    """

    if "sink" not in args:
        args = {"sink": args}

    args = super().preload_clean_args(args)

    assert "sink" in args
    assert isinstance(args["sink"], dict)
    assert len(args["sink"]) > 0

    sink = args["sink"]
    for name, detail in sink.items():

        if ("frametype" not in detail) or (detail["frametype"] != "dict"):
            logger.error(
                "Invalid configuration. Only JSON/Dictionaries are supported by this sink transform",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        if (
            ("filename" not in detail)
            or (not isinstance(detail["filename"], str))
            or ("params" not in detail)
            or (not isinstance(detail["params"], dict))
        ):
            logger.error(
                "Invalid args. Filename (string) and params (dict) are required",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        filename = detail["filename"]
        if not filename.lower().endswith(".json"):
            logger.error(
                "Input file must be a .json file",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        detail["root"] = self.config.enrich_data_dir

        tags = detail.get("tags", [])
        if isinstance(tags, str):
            tags = [tags]
        detail["tags"] = tags

        # => Materialize the path...
        detail["filename"] = self.config.get_file(
            detail["filename"], extra={"frame_name": name}
        )

    return args

process(state)

Store the dictionary 'frames' in the state in files.

Source code in enrichsdk/contrib/transforms/jsonsink/__init__.py
def process(self, state):
    """
    Store the dictionary 'frames' in the state in files.
    """
    logger.debug(
        "{} - process".format(self.name),
        extra=self.config.get_extra({"transform": self.name}),
    )

    available_frames = state.get_frame_list()

    # => First construct input for the pandasframe
    extra = {}
    args_input = {}
    write_input = {}
    framecls = self.config.get_dataframe("dict")

    sink = self.args["sink"]
    for pattern in sink:
        # The pattern could be precise dataframe name or could be
        # regular expression.
        regex = re.compile("^{}$".format(pattern))
        frames = [
            m.group(0) for f in available_frames for m in [regex.search(f)] if m
        ]
        if len(frames) == 0:
            logger.warning("Pattern has not matched any frames: {}".format(pattern))
            continue

        for f in frames:

            # Get the details of this frame
            detail = state.get_frame(f)

            # Handle frametype
            frametype = detail["frametype"]
            if frametype != "dict":
                logger.warning(
                    "Pattern has matched non-dict frame: {}".format(f),
                    extra=self.config.get_extra({"transform": self.name}),
                )
                continue

            # Now construct the output file name
            filename = sink[pattern]["filename"]
            filename = self.config.get_file(
                filename, create_dir=True, extra={"frame_name": f}
            )

            extra[f] = {
                "notes": self.collapse_notes(detail),
                "descriptions": self.collapse_descriptions(detail),
            }

            params = sink[pattern].get("params", {})
            write_input[f] = {
                "frametype": detail["frametype"],
                "filename": filename,
                "pattern": pattern,
                "df": detail["df"],
                "params": params,
            }

            args_input[f] = copy.copy(sink[pattern])
            args_input[f]["filename"] = filename

    framecls.write(args_input, write_input)

    for name in write_input:

        detail = write_input[name]

        # => Insert columns and tags
        pattern = detail["pattern"]
        detail["params"]["tags"] = sink[pattern]["tags"]

        # Incorporate columns, notes and description
        detail["params"].update(extra[name])

        detail["params"] = [
            detail["params"],
            {
                "type": "lineage",
                "transform": self.name,
                "dependencies": [
                    {"type": "dataframe", "nature": "input", "objects": [name]},
                    {
                        "type": "file",
                        "nature": "output",
                        "objects": [detail["params"]["filename"]],
                    },
                ],
            },
        ]

        # Insert additional detail
        detail["transform"] = self.name
        detail["history"] = [{"transform": self.name, "log": "Wrote output"}]

        state.update_frame(name, detail)

    logger.debug(
        "Finished writing data",
        extra=self.config.get_extra({"transform": self.name}),
    )

    ###########################################
    # => Return
    ###########################################
    return state

validate_args(what, state)

An extra check on the arguments to make sure they are consistent with the specification

Source code in enrichsdk/contrib/transforms/jsonsink/__init__.py
def validate_args(self, what, state):
    """
    An extra check on the arguments to make sure
    they are consistent with the specification
    """
    args = self.args
    assert "sink" in args
    assert isinstance(args["sink"], dict)
    assert len(args["sink"]) > 0

    sink = args["sink"]
    for name, detail in sink.items():
        assert ("frametype" in detail) and (detail["frametype"] == "dict")
        assert "filename" in detail
        assert "params" in detail

validate_results(what, state)

Check to make sure that the execution completed correctly

Source code in enrichsdk/contrib/transforms/jsonsink/__init__.py
def validate_results(self, what, state):
    """
    Check to make sure that the execution completed correctly
    """
    pass

JSONSource(*args, **kwargs)

Bases: Source

Load a file into a 'dict' frame in the state.

Params are meant to be passed as parameters to update_frame.

Example configuration::

 ...

 "args": {
     "source": {
         'hello': {
             'frametype': 'dict',
             'filename': '%(data_root)s/shared/hello.json',
             'params': {}
         }
     }
 }
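
The loading side is symmetric: each source entry names a 'dict' frame and the JSON file it is read from. A rough standalone equivalent, with %-formatting standing in for the SDK's path materialization::

import json

def load_json_sources(source, context):
    """Sketch: build a {frame_name: dict} mapping from a 'source' config."""
    frames = {}
    for name, detail in source.items():
        filename = detail["filename"] % context   # e.g. fills %(data_root)s
        with open(filename) as fd:
            frames[name] = json.load(fd)          # stored as a 'dict' frame
    return frames
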
Source code in enrichsdk/contrib/transforms/jsonsource/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "JSONSource"

    self.testdata = {
        "conf": {
            "args": {
                "source": {
                    "hello": {
                        "filename": "%(data_root)s/shared/hello.json",
                        "frametype": "dict",
                        "params": {},
                    }
                }
            }
        },
        "data": {},
    }

preload_clean_args(args)

Check if the args are consistent with the specification.

Source code in enrichsdk/contrib/transforms/jsonsource/__init__.py
def preload_clean_args(self, args):
    """
    Check if the args are consistent with the
    specification.
    """

    # Backward compatibility
    if "source" not in args:
        args = {"source": args}

    args = super().preload_clean_args(args)

    # Sanity check...
    assert isinstance(args, dict)
    assert "source" in args
    assert isinstance(args["source"], dict)

    source = args["source"]
    for name, detail in source.items():

        if ("frametype" not in detail) or (detail["frametype"] != "dict"):
            logger.error(
                "Invalid configuration. Only JSON source supported by this source transform",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        if (
            ("filename" not in detail)
            or (not isinstance(detail["filename"], str))
            or ("params" not in detail)
            or (not isinstance(detail["params"], dict))
        ):
            logger.error(
                "Invalid args. Filename (string) and params (dict) are required",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        filename = detail["filename"]
        if not filename.lower().endswith(".json"):
            logger.error(
                "Input file must be a .json file",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        # => Materialize the path...
        detail["filename"] = self.config.get_file(detail["filename"])

    return args

process(state)

Load the json files into 'dict' frames and store them in the state.

Source code in enrichsdk/contrib/transforms/jsonsource/__init__.py
def process(self, state):
    """
    Load the json files into 'dict' frames and store them in the state.
    """
    logger.debug(
        "{} - process".format(self.name),
        extra=self.config.get_extra({"transform": self.name}),
    )

    source = self.args["source"]
    for name, detail in source.items():

        filename = detail["filename"]
        data = json.load(open(filename))

        updated_detail = {
            "df": data,
            "transform": self.name,
            "frametype": "dict",
            "params": [
                {
                    "type": "compute",
                },
                {
                    "type": "lineage",
                    "transform": self.name,
                    "dependencies": [
                        {"type": "file", "nature": "input", "objects": [filename]}
                    ],
                },
            ],
            "history": [
                # Add a log entry describing the change
                {"transform": self.name, "log": "Loaded json file"}
            ],
        }

        # Update the state.
        state.update_frame(name, updated_detail, create=True)

    ###########################################
    # => Return
    ###########################################
    return state

validate_args(what, state)

Double check the arguments

Source code in enrichsdk/contrib/transforms/jsonsource/__init__.py
def validate_args(self, what, state):
    """
    Double check the arguments
    """
    assert isinstance(self.args, dict)
    assert "source" in self.args
    assert isinstance(self.args["source"], dict)

    source = self.args["source"]
    for name, detail in source.items():
        assert ("frametype" in detail) and (detail["frametype"] == "dict")
        assert "filename" in detail
        assert "params" in detail

validate_results(what, state)

Check to make sure that the execution completed correctly

Source code in enrichsdk/contrib/transforms/jsonsource/__init__.py
def validate_results(self, what, state):
    """
    Check to make sure that the execution completed correctly
    """

    source = self.args["source"]
    for name, detail in source.items():
        if not state.reached_stage(name, self.name):
            raise Exception("Could not find new frame created for {}".format(name))

        detail = state.get_frame(name)
        df = detail["df"]

        # Check if it is a valid dictionary...
        assert isinstance(df, dict)

PQExport(*args, **kwargs)

Bases: Sink

Parquet export for dataframes.

The configuration requires a list of exports, each of which specifies a pattern for the frame name::

 'conf': {
    'args': {
        "exports": [
          {
              "name": "%(frame)s_pq",
              "type": "pq", # optional. Default is pq
              "frames": ["cars"],
              "filename": "%(output)s/%(runid)s/%(frame)s.pq",
              "params": {
                  # parquet parameters.
                  # "compression": 'gzip'
                  # "engine": 'auto'
                  # "index" :None,
                  # "partition_cols": None
              }
           }
        ]
    }
}
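
The params dictionary is handed to pandas' DataFrame.to_parquet, so the commented-out keys above are ordinary to_parquet keyword arguments. A minimal sketch of what one export entry does for a single frame (frame lookup and filename templating omitted; the optional sample mirrors the transform's sample flag)::

import pandas as pd

def export_parquet(df, filename, params=None, sample=True):
    """Sketch: write a pandas frame (and optionally a small sample) as parquet."""
    params = params or {}
    df.to_parquet(filename, **params)             # e.g. compression="gzip"
    if sample:
        size = min(1000, df.shape[0])
        df.sample(size).to_parquet(filename + ".sample", **params)
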
Source code in enrichsdk/contrib/transforms/pqexport/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "PQExport"
    self.roles_supported = ["Export"]
    self.roles_current = "Export"

    data_root = os.path.join(os.environ["ENRICH_TEST"], self.name)
    self.testdata = {
        "data_root": data_root,
        "outputdir": os.path.join(data_root, "runs"),
        "inputdir": os.path.join(os.environ["ENRICH_TEST"]),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {
            "args": {
                "exports": [
                    {
                        "name": "%(frame)s_pq",
                        # "type": "pq",
                        "frames": ["cars"],
                        "filename": "%(output)s/%(runid)s/%(frame)s.pq",
                        "sample": True,
                        "params": {
                            # "compression": 'gzip'
                            # "engine": 'auto'
                            # "index" :None,
                            # "partition_cols": None
                        },
                    }
                ]
            }
        },
        "data": {
            "cars": {
                "transform": "CarSales",
                "filename": "sales.csv",
                "params": {"sep": ","},
            }
        },
    }

process(state)

Export frames as parquet files as shown in the example.

Source code in enrichsdk/contrib/transforms/pqexport/__init__.py
def process(self, state):
    """
    Export frames as parquet files as shown in the example.
    """

    # Sanity check...
    for e in self.args["exports"]:

        namebase = e["name"]
        params = e.get("params", {})
        sample = e.get("sample", True)

        # Collect all the frames data
        for f in e["frames"]:

            detail = state.get_frame(f)
            if detail is None:
                raise Exception("Frame not present in state: {}".format(f))

            if detail["frametype"] != "pandas":
                raise Exception("Frame not a pandas dataframe: {}".format(f))

            df = detail["df"]

            # Resolve the locations
            filename = os.path.abspath(
                self.config.get_file(e["filename"], extra={"frame": f})
            )
            relpath = self.config.get_relative_path(
                filename, what="enrich_data_dir"
            )

            # Check over-rides
            overrides = self.frame_get_overrides(detail)
            override_present = len(overrides) > 0
            save = overrides.get("save", True)
            if save:
                try:
                    os.makedirs(os.path.dirname(filename))
                except:
                    pass
                df.to_parquet(filename, **params)

                if sample:
                    size = min(1000, df.shape[0])
                    df.sample(size).to_parquet(filename + ".sample", **params)

            else:
                logger.warn(
                    "Did not save {} due to overrides".format(f),
                    extra=self.config.get_extra(
                        {
                            "transform": self.name,
                            "data": "Overrides: {}".format(overrides),
                        }
                    ),
                )

            if not os.path.exists(filename):
                logger.error(
                    "PQ file not created or missing",
                    extra=self.config.get_extra(
                        {
                            "transform": self.name,
                            "data": "Filename: {}\nOverride Present: {}".format(
                                filename, override_present
                            ),
                        }
                    ),
                )
                raise Exception("PQ file missing")

            # => Create state detail
            state_detail = {
                "df": None,
                "frametype": "db",
                "transform": self.name,
                "params": [
                    {
                        "filename": filename,
                        "action": "output",
                        "frametype": "binary",
                        "columns": self.collapse_columns(detail),
                        "descriptions": ["Parquet export of {} frame".format(f)],
                        "components": [
                            {
                                "filename": relpath,
                                "type": "pq",
                                "rows": df.shape[0],
                                "columns": df.shape[1],
                                "sha256sum": get_checksum(filename),
                                "filesize": "{0:0.1f} MB".format(
                                    get_file_size(filename) / (1024 * 1024)
                                ),
                                "modified_time": str(
                                    time.ctime(os.path.getmtime(filename))
                                ),
                                "create_time": str(
                                    time.ctime(os.path.getctime(filename))
                                ),
                            }
                        ],
                    },
                    {
                        "type": "lineage",
                        "transform": self.name,
                        "dependencies": [
                            {
                                "type": "dataframe",
                                "nature": "input",
                                "objects": [f],
                            },
                            {
                                "type": "file",
                                "nature": "output",
                                "objects": [filename],
                            },
                        ],
                    },
                ],
                "history": [{"transform": self.name, "log": "Write Parquet export"}],
            }
            try:
                name = namebase % {"frame": f}
                state.update_frame(name, state_detail, create=True)
                state.make_note("Generated PQ export for {}".format(f))
            except:
                logger.exception(
                    "Unable to store state",
                    extra=self.config.get_extra({"transform": self.name}),
                )
                raise Exception("Error while storing")

SQLExport(*args, **kwargs)

Bases: Sink

Export dataframes into a SQL database. The args specify what should be exported and how.

The transform args provide the specification:

  * exports: A list of files that must be exported. Each is a
    dictionary with the following elements:

      * name: Name of this export. Used for internal tracking and notifications.
      * filename: Output filename. Can refer to other global attributes such as `data_root`, `enrich_root_dir`, etc.
      * type: Type of the export. Only `sqlite` is supported for now.
      * frames: List of frames of type `pandas` that should be
        exported as part of this file.

Example::

....
"transforms": {
    "enabled": [
       ...
       {
         "transform": "SQLExport",
          "args": {
              "exports": [
                {
                   "type": "sqlite",
                   "filename": "%(output)s/cars.sqlite",
                   "frames": ["cars", "alpha"]
                },
               ...
              ]
            },
           ...
       }
    ...
   }
 }
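
Internally the transform creates each table schema via pandas and bulk-loads a temporary CSV through the sqlite3 command line tool. For modest frame sizes the same outcome can be sketched with pandas' own to_sql, which is what the illustrative helper below does; it is not the transform's actual loading path, and frames here is an assumed mapping from frame names to pandas DataFrames::

import sqlite3

def export_sqlite(filename, frames, replace=False):
    """Sketch: dump each pandas frame into one table of a SQLite file."""
    conn = sqlite3.connect(filename)
    try:
        for name, df in frames.items():
            # if_exists="replace" mirrors the transform's optional 'replace' flag
            df.to_sql(name, conn,
                      if_exists="replace" if replace else "fail",
                      index=False)
    finally:
        conn.close()
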
Source code in enrichsdk/contrib/transforms/sqlexport/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "SQLExport"
    self.description = "Export frames into SQLite/Other formats"
    self.roles_supported = ["Export"]
    self.roles_current = "Export"

    self.testdata = {
        "conf": {
            "args": {
                "exports": [
                    {
                        "name": "customerinfo",
                        "filename": "%(output)s/%(runid)s/customerinfo.sqlite",
                        "type": "sqlite",
                        "frames": ["customerinfo"],
                    }
                ]
            }
        },
        "data": {
            "customerinfo": {
                "transform": "MemberLoyaltyMetadata",
                "filename": "customerinfo.csv",
                "params": {"sep": ",", "dtype": {"MEMBERSHIP_ID_SUFFIX": "str"}},
            }
        },
    }

preload_clean_args(args)

Enforce the args specification given in the example above

Source code in enrichsdk/contrib/transforms/sqlexport/__init__.py
def preload_clean_args(self, args):
    """
    Enforce the args specification given in the
    example above
    """
    args = super().preload_clean_args(args)

    if len(args) == 0:
        raise Exception("Empty args provided")

    if ("exports" not in args) or (not isinstance(args["exports"], list)):
        raise Exception("SQLExport requires a series of exports (a list)")

    for e in args["exports"]:

        if (
            (not isinstance(e, dict))
            or ("filename" not in e)
            or ("name" not in e)
            or ("type" not in e)
            or ("frames" not in e)
        ):
            raise Exception(
                "Each element of the export should be a dictionary with filename, type, and frames"
            )

        if ("replace" in e) and (not isinstance(e["replace"], bool)):
            raise Exception("replace is a boolean variable")

        if e["type"] != "sqlite":
            raise Exception("Only sqlite exports are supported in current version")

        e["filename"] = os.path.abspath(self.config.get_file(e["filename"]))
        e["relpath"] = os.path.relpath(e["filename"], self.config.data_root)

    return args

process(state)

Execute the export specification.

Source code in enrichsdk/contrib/transforms/sqlexport/__init__.py
def process(self, state):
    """
    Execute the export specification.
    """

    exports = self.args["exports"]

    # Look at messages sent from transforms and collect the
    # frames.
    extra = defaultdict(list)
    while True:
        msg = state.msgpop(self.name)
        if msg is None:
            break
        if not isinstance(msg, dict):
            continue
        data = msg.get('data', None)
        if ((isinstance(data, dict)) and
            ('frames' in data)):
            name = data.get('name', 'default')
            frames = data['frames']
            extra[name].extend(frames)

    # Sanity check...
    for e in exports:

        name = e.get('name', 'default')

        # Collect all the frames data
        missing = []
        invalid = []
        frames = {}

        allframes = e["frames"] + extra.get(name, [])
        for f in allframes:
            detail = state.get_frame(f)
            if detail is None:
                missing.append(f)
                continue

            if detail["frametype"] != "pandas":
                invalid.append(f)
                continue

            frames[f] = detail

        if len(missing) > 0 or len(invalid) > 0:
            logger.error(
                "Unable to export frames",
                extra=self.config.get_extra(
                    {
                        "transform": self.name,
                        "data": "Invalid: {}\nMissing: {}".format(invalid, missing),
                    }
                ),
            )
            raise Exception("Error while exporting")

        filename = e["filename"]
        filename = os.path.abspath(self.config.get_file(filename))
        relpath = self.config.get_relative_path(filename, what="enrich_data_dir")

        name = e.get("name", os.path.basename(filename))
        replace = e.get("replace", False)

        try:
            os.makedirs(os.path.dirname(filename))
        except:
            pass

        # Creating a database file
        conn = sqlite3.connect(filename)
        c = conn.cursor()

        for f in frames:

            # => Write the frames
            overrides = self.frame_get_overrides(frames[f])
            override_present = len(overrides) > 0
            save = overrides.get("save", True)
            if save:
                try:

                    # Status flag
                    failed = False

                    df = frames[f]["df"]

                    # Drop existing table if necessary...
                    if replace:
                        c.execute(f"DROP TABLE IF EXISTS {f}")

                    # => First create the table schema
                    ddl = pd.io.sql.get_schema(df, f)
                    c.execute(ddl)  # CREATE table

                    # => Dump the dataframe to a csv
                    fd, tmpfile = tempfile.mkstemp(prefix="sqlexport")
                    os.close(fd)
                    df.to_csv(tmpfile, header=False, index=False)

                    # => Load it into sqlite
                    cmd = [
                        "/usr/bin/sqlite3",
                        filename,
                        "-cmd",
                        ".separator ,",
                        ".import {} {}".format(tmpfile, f),
                    ]

                    process = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
                    )
                    out, err = process.communicate()
                    err = err.decode("utf-8")
                    out = out.decode("utf-8")

                    # Dump what we have seen
                    if len(err) > 0:
                        failed = True
                        logfunc = partial(
                            logger.error, "Unable to export {}".format(f)
                        )
                    else:
                        logfunc = partial(logger.debug, "Exported {}".format(f))
                    logfunc(
                        extra=self.config.get_extra(
                            {
                                "transform": self.name,
                                "data": "Filename:{}\nOutput\n-----\n{}\n\nErr\n----\n{}".format(
                                    filename, out, err
                                ),
                            }
                        )
                    )

                    # => Update the state for this transform..
                    if not failed:
                        state.update_frame(
                            f,
                            {
                                "df": frames[f]["df"],
                                "frametype": frames[f]["frametype"],
                                "transform": self.name,
                                "history": [{"log": "Exported to SQLite"}],
                                "params": {
                                    "type": "lineage",
                                    "transform": self.name,
                                    "dependencies": [
                                        {
                                            "type": "dataframe",
                                            "nature": "input",
                                            "objects": [f],
                                        },
                                        {
                                            "type": "file",
                                            "nature": "output",
                                            "objects": [filename],
                                        },
                                    ],
                                },
                            },
                        )

                except:
                    logger.exception(
                        "Unable to export dataframe {}".format(f),
                        extra=self.config.get_extra(
                            {
                                "transform": self.name,
                            }
                        ),
                    )

                # Cleanup...
                try:
                    if os.path.exists(tmpfile):
                        os.remove(tmpfile)
                except:
                    pass

                # Dont proceed
                if failed:
                    raise Exception("Error while exporting {}".format(f))

            else:
                logger.warn(
                    "Did not save {} due to overrides".format(f),
                    extra=self.config.get_extra(
                        {
                            "transform": self.name,
                            "data": "Overrides: {}".format(overrides),
                        }
                    ),
                )

        conn.close()
        if not os.path.exists(filename):
            logger.error(
                "SQLite file not created or missing",
                extra=self.config.get_extra(
                    {
                        "transform": self.name,
                        "data": "Filename: {}\nOverride Present: {}".format(
                            filename, override_present
                        ),
                    }
                ),
            )
            raise Exception("SQLite file missing")

        # Now create the state entry
        detail = {
            "df": None,
            "frametype": "db",
            "transform": self.name,
            "params": {
                "filename": filename,
                "action": "output",
                "frametype": "db",
                "descriptions": [
                    "SQLite export of {} frames ({})".format(
                        len(frames), ",".join(frames)
                    )
                ],
                "notes": ["Frames included: {}".format(",".join(frames))],
                "components": [
                    {
                        "filename": relpath,
                        "type": "sqlite",
                        "sha256sum": get_checksum(filename),
                        "filesize": "{0:0.3f} MB".format(
                            get_file_size(filename) / (1024 * 1024)
                        ),
                        "modified_time": str(
                            time.ctime(os.path.getmtime(filename))
                        ),
                        "create_time": str(time.ctime(os.path.getctime(filename))),
                    }
                ],
            },
            "history": [{"transform": self.name, "log": "Write SQLite export"}],
        }
        try:
            state.update_frame(name, detail, create=True)
            state.make_note("Generated database export")
        except:
            logger.exception(
                "Unable to store state",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Error while storing")

validate_results(what, state)

Check to make sure that the execution completed correctly

Source code in enrichsdk/contrib/transforms/sqlexport/__init__.py
def validate_results(self, what, state):
    """
    Check to make sure that the execution completed correctly
    """
    pass

TableSink(*args, **kwargs)

Bases: Sink

Transform to dump dataframes in state into files.

Parameters specific to this module include:

* sink: A dictionary of dataframe names and how to output them. It has a number of attributes:

    * type: Output type. Only the 'table' value is supported for this
      option right now.
    * filename: Output filename. You can use default parameters such
      as runid.

  The name of the dataframe can be a regular expression, allowing you
  to specify a simple rule for an arbitrary number of frames.

Example::

....
"transforms": {
    "enabled": [
        ...
        {
            "transform": "TableSink",
            "args": {
                "sink": {
                    "article": {
                        "frametype": "pandas",
                        "filename": "%(output)s/%(runid)s/article.csv",
                        "params": {
                            "sep": "|"
                        }
                    },
                    ...
                }
            }
            ...
        }
    ]
 }
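
As with JSONSink, the keys of sink are treated as anchored regular expressions over frame names, so a single entry can cover many frames. A bare-bones sketch of what one entry boils down to for pandas frames (templating, sorting, and override handling omitted; params is passed to DataFrame.to_csv; frames and context are illustrative stand-ins for pipeline state)::

import os
import re

def write_table_sinks(sink, frames, context):
    """Sketch: write every pandas frame whose name matches a sink pattern."""
    for pattern, detail in sink.items():
        regex = re.compile("^{}$".format(pattern))
        for name, df in frames.items():
            if not regex.search(name):
                continue
            filename = detail["filename"] % dict(context, frame_name=name)
            os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
            df.to_csv(filename, **detail.get("params", {}))   # e.g. sep="|"
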
Source code in enrichsdk/contrib/transforms/tablesink/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "TableSink"

    self.testdata = {
        "conf": {
            "args": {
                "sink": {
                    "cars": {
                        "frametype": "pandas",
                        "filename": "%(output)s/cars_revised.csv",
                        "params": {"sep": ","},
                    }
                }
            }
        },
        "data": {
            "carmodel": {
                "transform": "CarModel",
                "filename": "cars.csv",
                "params": {"sep": ","},
                "state": {
                    "params": [
                        {
                            "type": "args",
                            "transform": "TableSink",
                            "args": {"save": False, "rows": 124},
                        }
                    ]
                },
            }
        },
    }

preload_clean_args(args)

Check to make sure that the arguments are consistent with the specification mentioned above

Source code in enrichsdk/contrib/transforms/tablesink/__init__.py
def preload_clean_args(self, args):
    """
    Check to make sure that the arguments
    are consistent with the specification mentioned
    above
    """

    # Sanity check...
    assert isinstance(args, dict)
    assert len(args) > 0

    # Backward compatibility
    if "sink" not in args:
        args = {"sink": args}

    args = super().preload_clean_args(args)

    assert "sink" in args
    sink = args["sink"]
    assert isinstance(sink, dict)
    assert len(sink) > 0

    for pattern, detail in sink.items():

        if (
            ("type" in detail)
            and ("frametype" not in detail)
            and (detail["type"] == "table")
        ):
            detail["frametype"] = "pandas"

        if ("frametype" not in detail) or (detail["frametype"] != "pandas"):
            logger.error(
                "Invalid configuration. Only pandas table source supported by this sink transform",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        if (
            ("filename" not in detail)
            or (not isinstance(detail["filename"], str))
            or ("params" not in detail)
            or (not isinstance(detail["params"], dict))
        ):
            logger.error(
                "Invalid args. Filename (string) and params (dict) are required",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        detail["root"] = self.config.enrich_data_dir

        tags = detail.get("tags", [])
        if isinstance(tags, str):
            tags = [tags]
        detail["tags"] = tags

        sortcols = detail.get("sort", [])
        if isinstance(sortcols, str):
            sortcols = [sortcols]
        detail["sort"] = sortcols

    return args

process(state)

Execute the tablesink specification

Source code in enrichsdk/contrib/transforms/tablesink/__init__.py
def process(self, state):
    """
    Execute the tablesink specification
    """

    # => What exists in the state?
    available_frames = state.get_frame_list()

    # => First construct input for the pandasframe
    extra = {}
    write_input = {}
    args_input = {}
    framecls = self.config.get_dataframe("pandas")
    skipped = []

    sink = self.args["sink"]
    for pattern in sink:
        # The pattern could be precise dataframe name or could be
        # regular expression.
        regex = re.compile("^{}$".format(pattern))
        frames = [
            m.group(0) for f in available_frames for m in [regex.search(f)] if m
        ]

        for f in frames:
            # For each dataframe that is in the system

            detail = state.get_frame(f)

            # => Are there any extra instructions?
            overrides = self.frame_get_overrides(detail)

            # => Materialize the path...
            filename = sink[pattern]["filename"]
            filename = self.config.get_file(
                filename, create_dir=True, extra={"frame_name": f}
            )
            # Collect all column information
            extra[f] = {
                "columns": self.collapse_columns(detail),
                "notes": self.collapse_notes(detail),
                "descriptions": self.collapse_descriptions(detail),
                "overrides": overrides,
            }

            # Which dataframe
            df = detail["df"]

            # Get the frame type
            frametype = detail["frametype"]

            # Order the dataframe if it is needed
            sortcols = sink[pattern]["sort"]
            if len(sortcols) > 0:
                df = framecls.sort_values(df, sortcols, ascending=False)
            params = sink[pattern].get("params", {})

            # Should I be writing this csv?
            save = params.get("save", True)
            save = overrides.get("save", save)

            if not save:
                skipped.append(f)

            write_input[f] = {
                "save": save,
                "frametype": frametype,
                "pattern": pattern,
                "df": df,
                "filename": filename,
                "params": params,
            }

            args_input[f] = copy.copy(sink[pattern])
            args_input[f]["filename"] = filename

    if len(skipped) > 0:
        logger.warning(
            "Not saving {} tables".format(len(skipped)),
            extra={"transform": self.name, "data": skipped},
        )

    # => Write output details
    framecls.write(args_input, write_input)

    for name in write_input:

        detail = write_input[name]

        # => Insert columns and tags
        pattern = detail["pattern"]

        # Attach the tags from the sink configuration
        detail["params"]["tags"] = sink[pattern]["tags"]

        # Incorporate columns, notes and description
        additional_params = extra[name]
        overrides = additional_params.pop("overrides", {})

        detail["params"].update(additional_params)

        # Insert any overrides provided in the state
        if "rows" in overrides:
            detail["params"]["components"][0]["rows"] = overrides["rows"]

        detail["params"] = [
            detail["params"],
            {
                "type": "lineage",
                "transform": self.name,
                "dependencies": [
                    {"type": "dataframe", "nature": "input", "objects": [name]},
                    {
                        "type": "file",
                        "nature": "output",
                        "objects": [detail["filename"]],
                    },
                ],
            },
        ]

        # Insert additional detail
        detail["transform"] = self.name
        detail["history"] = [{"transform": self.name, "log": "Wrote output"}]

        state.update_frame(name, detail)

    logger.debug(
        "Finished writing data",
        extra=self.config.get_extra({"transform": self.name}),
    )

validate_args(what, state)

Extra validation of the arguments

Source code in enrichsdk/contrib/transforms/tablesink/__init__.py
def validate_args(self, what, state):
    """
    Extra validation of the arguments
    """
    sink = self.args["sink"]

    assert isinstance(sink, dict)
    for pattern, detail in sink.items():
        assert ("frametype" in detail) and (detail["frametype"] == "pandas")
        assert "filename" in detail
        assert "params" in detail

TableSource(*args, **kwargs)

Bases: Source

Load csv/other files into pandas dataframes.

Parameters specific to this module include:

* source: A dictionary of dataframe names and how to
  load them. It has a number of attributes:

    * type: Source type. Only the 'table' value is
      supported for this option.
    * filename: Input filename. You can use default
      parameters such as runid.
    * params: Params are arguments to [pandas read_csv](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html)

Example::

....
"transforms": {
    "enabled": [
        {
            "transform": "TableSource",
            "args": {
                "source": {
                    "article": {
                        "type": "file",
                        "filename": "%(data)s/ArticleData.csv",
                        "params": {
                            "delimiter": "|",
                            "dtype": {
                                "sku": "category",
                                "mc_code": "int64",
                                "sub_class": "category",
                                "priority": "float64"
                                ...
                            }
                        }
                    }
                }
              ...
            }
        }
    ...
   ]
 }
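
Each source entry is essentially a pandas.read_csv call, with the dtype strings in params resolved to concrete types (see the mapping dictionary in preload_clean_args below). A rough standalone equivalent, using an abbreviated type map::

import numpy as np
import pandas as pd

# Abbreviated version of the string-to-type mapping used by the transform
DTYPE_MAP = {
    "str": str,
    "float64": np.float64,
    "int64": np.int64,
    "category": "category",
}

def load_table_sources(source, context):
    """Sketch: build a {frame_name: DataFrame} mapping from a 'source' config."""
    frames = {}
    for name, detail in source.items():
        params = dict(detail.get("params", {}))
        if "dtype" in params:
            params["dtype"] = {col: DTYPE_MAP.get(t, t)
                               for col, t in params["dtype"].items()}
        filename = detail["filename"] % context      # e.g. fills %(data)s
        frames[name] = pd.read_csv(filename, **params)
    return frames
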
Source code in enrichsdk/contrib/transforms/tablesource/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "TableSource"

clean(state)

This is meant for subclasses to do some additional processing.

Source code in enrichsdk/contrib/transforms/tablesource/__init__.py
def clean(self, state):
    """
    This is meant for subclasses to do some additional processing.
    """
    pass

preload_clean_args(args)

Clean when the spec is loaded...

Source code in enrichsdk/contrib/transforms/tablesource/__init__.py
def preload_clean_args(self, args):
    """
    Clean when the spec is loaded...
    """

    # Backward compatibility
    if "source" not in args:
        args = {"source": args}

    args = super().preload_clean_args(args)

    # Sanity check...
    assert isinstance(args, dict)
    assert "source" in args
    assert isinstance(args["source"], dict)

    for name, detail in args["source"].items():

        # Insert the frame into the args for backward
        # compatibility.
        if (
            ("type" in detail)
            and ("frametype" not in detail)
            and (detail["type"] == "table")
        ):
            detail["frametype"] = "pandas"

        if ("frametype" not in detail) or (detail["frametype"] != "pandas"):
            logger.error(
                "Invalid configuration. Only pandas table source supported by this source transform",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        if (
            ("filename" not in detail)
            or (not isinstance(detail["filename"], str))
            or ("params" not in detail)
            or (not isinstance(detail["params"], dict))
        ):
            logger.error(
                "Invalid args. Filename (string) and params (dict) are required",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        mapping = {
            "str": str,
            "float": np.float64,
            "float64": np.float64,
            "np.float64": np.float64,
            "np.int64": np.int64,
            "int": np.int64,
            "int64": np.int64,
            "datetime": datetime,
            "category": "category",
        }

        # => Materialize the path...
        detail["filename"] = self.config.get_file(detail["filename"])
        detail["root"] = self.config.enrich_data_dir
        params = detail["params"]
        if "dtype" in params:
            for attr in params["dtype"]:
                if params["dtype"][attr] in mapping:
                    params["dtype"][attr] = mapping[params["dtype"][attr]]
                else:
                    params["dtype"][attr] = eval(params["dtype"][attr])

    return args

process(state)

Load file...

Source code in enrichsdk/contrib/transforms/tablesource/__init__.py
def process(self, state):
    """
    Load file...
    """
    # Load all the dataframes. This will use the full enrich
    # deployment's beefed up read function.
    framecls = self.config.get_dataframe("pandas")
    source = self.args["source"]

    dfstates = framecls.read(source, {})
    for dfname, dfstate in dfstates.items():

        # => Insert column description
        columns = dfstate["params"]["columns"]
        for c in columns:
            columns[c]["description"] = self.get_column_description(dfname, c)

        params = dfstate["params"]
        if "filename" in params:
            filename = params["filename"]
        elif "filename" in source.get(dfname, {}):
            filename = source[dfname]["filename"]
        else:
            filename = "Unknown"

        detail = {
            "df": dfstate["df"],
            "transform": self.name,
            "frametype": "pandas",
            "params": [
                params,
                {
                    "type": "lineage",
                    "transform": self.name,
                    "dependencies": [
                        {"type": "file", "nature": "input", "objects": [filename]}
                    ],
                },
            ],
            "history": [
                {
                    "transform": self.name,
                    "log": "Read data using {}".format(framecls.__class__.__name__),
                }
            ],
        }
        try:
            state.update_frame(dfname, detail, create=True)
        except:
            logger.exception(
                "Unable to store state",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Error while loading")

    # Clean the uploaded data...
    self.clean(state)

fileops

FileOperations(*args, **kwargs)

Bases: FileOperationsBase

FileOperations performs a number of operations on files generated by pipelines.

The transform takes a list of actions. The only action type supported for now is copy. Each copy task requires source, destination, and instruction on what to do with existing file.

Example::

{
        "transform": "FileOperations",
        "enable": true,
        "dependencies": {
           ....
        },
        "args": {
            "actions": [
                  {
                    "action": "copy",
                    "src": "%(output)s/%(runid)s/profile.sqlite",
                    "dst": "%(data_root)s/shared/campaigns/profile_daily/profile.sqlite",
                    "backupsuffix": ".backup"
                  }
             ]
        }
}
Source code in enrichsdk/contrib/transforms/fileops/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "FileOperations"
    self.description = "File operations such as move/copy etc."

    test_root = os.environ["ENRICH_TEST"]
    self.testdata = {
        "data_root": os.path.join(test_root, self.name),
        "outputdir": os.path.join(test_root, self.name, "output"),
        "inputdir": test_root,
        "statedir": os.path.join(test_root, self.name, "state"),
        "global": {"args": {"rundate": "2020-01-10"}},
        "conf": {
            "args": {
                "actions": [
                    {
                        "action": "copy",
                        "src": "%(output)s/%(runid)s/outputs/cars.csv",
                        "dst": "%(data_root)s/shared/%(rundate)s/hello.csv",
                    }
                ]
            }
        },
        "data": {},
    }
instantiable() classmethod

Make this class instantiable

Source code in enrichsdk/contrib/transforms/fileops/__init__.py
@classmethod
def instantiable(cls):
    """
    Make this class instantiable
    """
    return True

jsonsink

JSONSink(*args, **kwargs)

Bases: Sink

Store a 'dict' frame that is present in the state into a file.

Params are meant to be passed as parameter to update_frame.

Example configuration::

 "args": {
     "sink": {
         'test': {
             'frametype': 'dict',
             'filename': '%(output)s/%(runid)s/mytestoutput.json',
             'params': {}
         }
     }
 }
Source code in enrichsdk/contrib/transforms/jsonsink/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "JSONSink"

    self.testdata = {
        "conf": {
            "args": {
                "sink": {
                    "frame1": {
                        "frametype": "dict",
                        "filename": "%(output)s/%(runid)s/mytestoutput.json",
                        "params": {},
                    }
                }
            }
        },
        "data": {
            "frame1": {
                "filename": "outputjson.json",
                "frametype": "dict",
                "transform": "TestJSON",
                "params": {},
            }
        },
    }
preload_clean_args(args)

Clean when the spec is loaded...

Source code in enrichsdk/contrib/transforms/jsonsink/__init__.py
def preload_clean_args(self, args):
    """
    Clean when the spec is loaded...
    """

    if "sink" not in args:
        args = {"sink": args}

    args = super().preload_clean_args(args)

    assert "sink" in args
    assert isinstance(args["sink"], dict)
    assert len(args["sink"]) > 0

    sink = args["sink"]
    for name, detail in sink.items():

        if ("frametype" not in detail) or (detail["frametype"] != "dict"):
            logger.error(
                "Invalid configuration. Only JSON/Dictionaries are supported by this sink transform",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        if (
            ("filename" not in detail)
            or (not isinstance(detail["filename"], str))
            or ("params" not in detail)
            or (not isinstance(detail["params"], dict))
        ):
            logger.error(
                "Invalid args. Filename (string) and params (dict) are required",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        filename = detail["filename"]
        if not filename.lower().endswith(".json"):
            logger.error(
                "Input file must a .json file",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        detail["root"] = self.config.enrich_data_dir

        tags = detail.get("tags", [])
        if isinstance(tags, str):
            tags = [tags]
        detail["tags"] = tags

        # => Materialize the path...
        detail["filename"] = self.config.get_file(
            detail["filename"], extra={"frame_name": name}
        )

    return args
process(state)

Store the dictionary 'frames' in the state in files.

Source code in enrichsdk/contrib/transforms/jsonsink/__init__.py
def process(self, state):
    """
    Store the dictionary 'frames' in the state in files.
    """
    logger.debug(
        "{} - process".format(self.name),
        extra=self.config.get_extra({"transform": self.name}),
    )

    available_frames = state.get_frame_list()

    # => First construct input for the pandasframe
    extra = {}
    args_input = {}
    write_input = {}
    framecls = self.config.get_dataframe("dict")

    sink = self.args["sink"]
    for pattern in sink:
        # The pattern could be precise dataframe name or could be
        # regular expression.
        regex = re.compile("^{}$".format(pattern))
        frames = [
            m.group(0) for f in available_frames for m in [regex.search(f)] if m
        ]
        if len(frames) == 0:
            logger.warning("Pattern has not matched any frames: {}".format(pattern))
            continue

        for f in frames:

            # Get the details of this frame
            detail = state.get_frame(f)

            # Handle frametype
            frametype = detail["frametype"]
            if frametype != "dict":
                logger.warning(
                    "Pattern has matched non-dict frame: {}".format(f),
                    extra=self.config.get_extra({"transform": self.name}),
                )
                continue

            # Now construct the output file name
            filename = sink[pattern]["filename"]
            filename = self.config.get_file(
                filename, create_dir=True, extra={"frame_name": f}
            )

            extra[f] = {
                "notes": self.collapse_notes(detail),
                "descriptions": self.collapse_descriptions(detail),
            }

            params = sink[pattern].get("params", {})
            write_input[f] = {
                "frametype": detail["frametype"],
                "filename": filename,
                "pattern": pattern,
                "df": detail["df"],
                "params": params,
            }

            args_input[f] = copy.copy(sink[pattern])
            args_input[f]["filename"] = filename

    framecls.write(args_input, write_input)

    for name in write_input:

        detail = write_input[name]

        # => Insert columns and tags
        pattern = detail["pattern"]
        detail["params"]["tags"] = sink[pattern]["tags"]

        # Incorporate columns, notes and description
        detail["params"].update(extra[name])

        detail["params"] = [
            detail["params"],
            {
                "type": "lineage",
                "transform": self.name,
                "dependencies": [
                    {"type": "dataframe", "nature": "input", "objects": [name]},
                    {
                        "type": "file",
                        "nature": "output",
                        "objects": [detail["params"]["filename"]],
                    },
                ],
            },
        ]

        # Insert additional detail
        detail["transform"] = self.name
        detail["history"] = [{"transform": self.name, "log": "Wrote output"}]

        state.update_frame(name, detail)

    logger.debug(
        "Finished writing data",
        extra=self.config.get_extra({"transform": self.name}),
    )

    ###########################################
    # => Return
    ###########################################
    return state
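
Note that the keys of the sink configuration are treated as anchored regular expressions, so a single entry can address several dict frames in the state. A minimal sketch of the matching rule used above (the frame names here are hypothetical):

import re

available_frames = ["daily_summary", "weekly_summary", "cars"]
pattern = ".*_summary"   # a hypothetical sink key

# Same anchoring as in process(): the pattern must match the full frame name
regex = re.compile("^{}$".format(pattern))
matched = [f for f in available_frames if regex.search(f)]
# matched == ["daily_summary", "weekly_summary"]
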
validate_args(what, state)

An extra check on the arguments to make sure it is consistent with the specification

Source code in enrichsdk/contrib/transforms/jsonsink/__init__.py
def validate_args(self, what, state):
    """
    An extra check on the arguments to make sure
    it is consistent with the specification
    """
    args = self.args
    assert "sink" in args
    assert isinstance(args["sink"], dict)
    assert len(args["sink"]) > 0

    sink = args["sink"]
    for name, detail in sink.items():
        assert ("frametype" in detail) and (detail["frametype"] == "dict")
        assert "filename" in detail
        assert "params" in detail
validate_results(what, state)

Check to make sure that the execution completed correctly

Source code in enrichsdk/contrib/transforms/jsonsink/__init__.py
def validate_results(self, what, state):
    """
    Check to make sure that the execution completed correctly
    """
    pass

jsonsource

JSONSource(*args, **kwargs)

Bases: Source

Load a file into a 'dict' frame in the state.

Params are meant to be passed as parameter to update_frame.

Example configuration::

 ...

 "args": {
     "source": {
         'hello': {
             'frametype': 'dict',
             'filename': '%(data_root)s/shared/hello.json',
             'params': {}
         }
     }
 }
Source code in enrichsdk/contrib/transforms/jsonsource/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "JSONSource"

    self.testdata = {
        "conf": {
            "args": {
                "source": {
                    "hello": {
                        "filename": "%(data_root)s/shared/hello.json",
                        "frametype": "dict",
                        "params": {},
                    }
                }
            }
        },
        "data": {},
    }
preload_clean_args(args)

Check if the args are consistent with the specification.

Source code in enrichsdk/contrib/transforms/jsonsource/__init__.py
def preload_clean_args(self, args):
    """
    Check if the args are consistent with the
    specification.
    """

    # Backward compatibility
    if "source" not in args:
        args = {"source": args}

    args = super().preload_clean_args(args)

    # Sanity check...
    assert isinstance(args, dict)
    assert "source" in args
    assert isinstance(args["source"], dict)

    source = args["source"]
    for name, detail in source.items():

        if ("frametype" not in detail) or (detail["frametype"] != "dict"):
            logger.error(
                "Invalid configuration. Only JSON source supported by this source transform",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        if (
            ("filename" not in detail)
            or (not isinstance(detail["filename"], str))
            or ("params" not in detail)
            or (not isinstance(detail["params"], dict))
        ):
            logger.error(
                "Invalid args. Filename (string) and params (dict) are required",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        filename = detail["filename"]
        if not filename.lower().endswith(".json"):
            logger.error(
                "Input file must a .json file",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        # => Materialize the path...
        detail["filename"] = self.config.get_file(detail["filename"])

    return args
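
For backward compatibility, an args dictionary that lacks the source key is wrapped automatically, so both of the following forms are accepted (a sketch based on the check above):

# Newer, explicit form
args = {
    "source": {
        "hello": {
            "frametype": "dict",
            "filename": "%(data_root)s/shared/hello.json",
            "params": {},
        }
    }
}

# Older form: preload_clean_args wraps this as {"source": {...}}
args = {
    "hello": {
        "frametype": "dict",
        "filename": "%(data_root)s/shared/hello.json",
        "params": {},
    }
}
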
process(state)

Load the json files into 'dict' frames and store them in the state.

Source code in enrichsdk/contrib/transforms/jsonsource/__init__.py
def process(self, state):
    """
    Load the json files into 'dict' frames and store them in the state.
    """
    logger.debug(
        "{} - process".format(self.name),
        extra=self.config.get_extra({"transform": self.name}),
    )

    source = self.args["source"]
    for name, detail in source.items():

        filename = detail["filename"]
        data = json.load(open(filename))

        updated_detail = {
            "df": data,
            "transform": self.name,
            "frametype": "dict",
            "params": [
                {
                    "type": "compute",
                },
                {
                    "type": "lineage",
                    "transform": self.name,
                    "dependencies": [
                        {"type": "file", "nature": "input", "objects": [filename]}
                    ],
                },
            ],
            "history": [
                # Add a log entry describing the change
                {"transform": self.name, "log": "Loaded json file"}
            ],
        }

        # Update the state.
        state.update_frame(name, updated_detail, create=True)

    ###########################################
    # => Return
    ###########################################
    return state
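
Once loaded, a downstream transform can retrieve the parsed content from the state; a minimal sketch assuming the hello frame from the example configuration:

# Inside another transform's process(state)
detail = state.get_frame("hello")
assert detail["frametype"] == "dict"

hello = detail["df"]   # the parsed JSON content as a Python dict
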
validate_args(what, state)

Double check the arguments

Source code in enrichsdk/contrib/transforms/jsonsource/__init__.py
def validate_args(self, what, state):
    """
    Double check the arguments
    """
    assert isinstance(self.args, dict)
    assert "source" in self.args
    assert isinstance(self.args["source"], dict)

    source = self.args["source"]
    for name, detail in source.items():
        assert ("frametype" in detail) and (detail["frametype"] == "dict")
        assert "filename" in detail
        assert "params" in detail
validate_results(what, state)

Check to make sure that the execution completed correctly

Source code in enrichsdk/contrib/transforms/jsonsource/__init__.py
def validate_results(self, what, state):
    """
    Check to make sure that the execution completed correctly
    """

    source = self.args["source"]
    for name, detail in source.items():
        if not state.reached_stage(name, self.name):
            raise Exception("Could not find new frame created for {}".format(name))

        detail = state.get_frame(name)
        df = detail["df"]

        # Check if it is a valid dictionary...
        assert isinstance(df, dict)

pqexport

PQExport(*args, **kwargs)

Bases: Sink

Parquet export for dataframes.

The configuration requires a list of exports, each of which specifies a pattern for the frame name::

 'conf': {
    'args': {
        "exports": [
          {
              "name": "%(frame)s_pq",
              "type": "pq", # optional. Default is pq
              "frames": ["cars"],
              "filename": "%(output)s/%(runid)s/%(frame)s.pq",
              "params": {
                  # parquet parameters.
                  # "compression": 'gzip'
                  # "engine": 'auto'
                  # "index" :None,
                  # "partition_cols": None
              }
           }
        ]
    }
}
Source code in enrichsdk/contrib/transforms/pqexport/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "PQExport"
    self.roles_supported = ["Export"]
    self.roles_current = "Export"

    data_root = os.path.join(os.environ["ENRICH_TEST"], self.name)
    self.testdata = {
        "data_root": data_root,
        "outputdir": os.path.join(data_root, "runs"),
        "inputdir": os.path.join(os.environ["ENRICH_TEST"]),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {
            "args": {
                "exports": [
                    {
                        "name": "%(frame)s_pq",
                        # "type": "pq",
                        "frames": ["cars"],
                        "filename": "%(output)s/%(runid)s/%(frame)s.pq",
                        "sample": True,
                        "params": {
                            # "compression": 'gzip'
                            # "engine": 'auto'
                            # "index" :None,
                            # "partition_cols": None
                        },
                    }
                ]
            }
        },
        "data": {
            "cars": {
                "transform": "CarSales",
                "filename": "sales.csv",
                "params": {"sep": ","},
            }
        },
    }
process(state)

Export frames as parquet files as shown in the example.

Source code in enrichsdk/contrib/transforms/pqexport/__init__.py
def process(self, state):
    """
    Export frames as parquet files as shown in the example.
    """

    # Sanity check...
    for e in self.args["exports"]:

        namebase = e["name"]
        params = e.get("params", {})
        sample = e.get("sample", True)

        # Collect all the frames data
        for f in e["frames"]:

            detail = state.get_frame(f)
            if detail is None:
                raise Exception("Frame not present in state: {}".format(f))

            if detail["frametype"] != "pandas":
                raise Exception("Frame not a pandas dataframe: {}".format(f))
                continue

            df = detail["df"]

            # Resolve the locations
            filename = os.path.abspath(
                self.config.get_file(e["filename"], extra={"frame": f})
            )
            relpath = self.config.get_relative_path(
                filename, what="enrich_data_dir"
            )

            # Check over-rides
            overrides = self.frame_get_overrides(detail)
            override_present = len(overrides) > 0
            save = overrides.get("save", True)
            if save:
                try:
                    os.makedirs(os.path.dirname(filename))
                except:
                    pass
                df.to_parquet(filename, **params)

                if sample:
                    size = min(1000, df.shape[0])
                    df.sample(size).to_parquet(filename + ".sample", **params)

            else:
                logger.warn(
                    "Did not save {} due to overrides".format(f),
                    extra=self.config.get_extra(
                        {
                            "transform": self.name,
                            "data": "Overrides: {}".format(overrides),
                        }
                    ),
                )

            if not os.path.exists(filename):
                logger.error(
                    "PQ file not created or missing",
                    extra=self.config.get_extra(
                        {
                            "transform": self.name,
                            "data": "Filename: {}\nOverride Present: {}".format(
                                filename, override_present
                            ),
                        }
                    ),
                )
                raise Exception("PQ file missing")

            # => Create state detail
            state_detail = {
                "df": None,
                "frametype": "db",
                "transform": self.name,
                "params": [
                    {
                        "filename": filename,
                        "action": "output",
                        "frametype": "binary",
                        "columns": self.collapse_columns(detail),
                        "descriptions": ["Parquet export of {} frame".format(f)],
                        "components": [
                            {
                                "filename": relpath,
                                "type": "pq",
                                "rows": df.shape[0],
                                "columns": df.shape[1],
                                "sha256sum": get_checksum(filename),
                                "filesize": "{0:0.1f} MB".format(
                                    get_file_size(filename) / (1024 * 1024)
                                ),
                                "modified_time": str(
                                    time.ctime(os.path.getmtime(filename))
                                ),
                                "create_time": str(
                                    time.ctime(os.path.getctime(filename))
                                ),
                            }
                        ],
                    },
                    {
                        "type": "lineage",
                        "transform": self.name,
                        "dependencies": [
                            {
                                "type": "dataframe",
                                "nature": "input",
                                "objects": [f],
                            },
                            {
                                "type": "file",
                                "nature": "output",
                                "objects": [filename],
                            },
                        ],
                    },
                ],
                "history": [{"transform": self.name, "log": "Write PQite export"}],
            }
            try:
                name = namebase % {"frame": f}
                state.update_frame(name, state_detail, create=True)
                state.make_note("Generated PQ export for {}".format(f))
            except:
                logger.exception(
                    "Unable to store state",
                    extra=self.config.get_extra({"transform": self.name}),
                )
                raise Exception("Error while storing")

sqlexport

SQLExport(*args, **kwargs)

Bases: Sink

Export dataframes into the SQL database. Args specify what and how the export should happen.

The transform args provides the specification:

  * exports: A list of files that must be exported. Each is a
    dictionary with the following elements:

      * name: Name of this export. Used for internal tracking and notifications.
      * filename: Output filename. Can refer to other global attributes such as `data_root`, `enrich_root_dir` etc
      * type: Type of the export. Only `sqlite` supported for now
      * frames: List of frames of the type `pandas` that should be
        exported as part of this file

Example::

....
"transforms": {
    "enabled": [
       ...
       {
         "transform": "SQLExport",
          "args": {
              "exports": [
                {
                   "type": "sqlite",
                   "filename": "%(output)s/cars.sqlite",
                   "frames": ["cars", "alpha"]
                },
               ...
              ]
            },
           ...
       }
    ...
   }
 }
Source code in enrichsdk/contrib/transforms/sqlexport/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "SQLExport"
    self.description = "Export frames into SQLite/Other formats"
    self.roles_supported = ["Export"]
    self.roles_current = "Export"

    self.testdata = {
        "conf": {
            "args": {
                "exports": [
                    {
                        "name": "customerinfo",
                        "filename": "%(output)s/%(runid)s/customerinfo.sqlite",
                        "type": "sqlite",
                        "frames": ["customerinfo"],
                    }
                ]
            }
        },
        "data": {
            "customerinfo": {
                "transform": "MemberLoyaltyMetadata",
                "filename": "customerinfo.csv",
                "params": {"sep": ",", "dtype": {"MEMBERSHIP_ID_SUFFIX": "str"}},
            }
        },
    }
preload_clean_args(args)

Enforce the args specification given in the example above

Source code in enrichsdk/contrib/transforms/sqlexport/__init__.py
def preload_clean_args(self, args):
    """
    Enforce the args specification given in the
    example above
    """
    args = super().preload_clean_args(args)

    if len(args) == 0:
        raise Exception("Empty args provided")

    if ("exports" not in args) or (not isinstance(args["exports"], list)):
        raise Exception("SQLExport requires a series of exports (a list)")

    for e in args["exports"]:

        if (
            (not isinstance(e, dict))
            or ("filename" not in e)
            or ("name" not in e)
            or ("type" not in e)
            or ("frames" not in e)
        ):
            raise Exception(
                "Each element of the export should be a dictionary with filename, type, and frames"
            )

        if ("replace" in e) and (not isinstance(e["replace"], bool)):
            raise Exception("replace is a boolean variable")

        if e["type"] != "sqlite":
            raise Exception("Only sqlite exports are supported in current version")

        e["filename"] = os.path.abspath(self.config.get_file(e["filename"]))
        e["relpath"] = os.path.relpath(e["filename"], self.config.data_root)

    return args
process(state)

Execute the export specification.

Source code in enrichsdk/contrib/transforms/sqlexport/__init__.py
def process(self, state):
    """
    Execute the export specification.
    """

    exports = self.args["exports"]

    # Look at messages sent from transforms and collect the
    # frames.
    extra = defaultdict(list)
    while True:
        msg = state.msgpop(self.name)
        if msg is None:
            break
        if not isinstance(msg, dict):
            continue
        data = msg.get('data', None)
        if ((isinstance(data, dict)) and
            ('frames' in data)):
            name = data.get('name', 'default')
            frames = data['frames']
            extra[name].extend(frames)

    # Sanity check...
    for e in exports:

        name = e.get('name', 'default')

        # Collect all the frames data
        missing = []
        invalid = []
        frames = {}

        allframes = e["frames"] + extra.get(name, [])
        for f in allframes:
            detail = state.get_frame(f)
            if detail is None:
                missing.append(f)
                continue

            if detail["frametype"] != "pandas":
                invalid.append(f)
                continue

            frames[f] = detail

        if len(missing) > 0 or len(invalid) > 0:
            logger.error(
                "Unable to export frames",
                extra=self.config.get_extra(
                    {
                        "transform": self.name,
                        "data": "Invalid: {}\nMissing: {}".format(invalid, missing),
                    }
                ),
            )
            raise Exception("Error while exporting")

        filename = e["filename"]
        filename = os.path.abspath(self.config.get_file(filename))
        relpath = self.config.get_relative_path(filename, what="enrich_data_dir")

        name = e.get("name", os.path.basename(filename))
        replace = e.get("replace", False)

        try:
            os.makedirs(os.path.dirname(filename))
        except:
            pass

        # Creating a database file
        conn = sqlite3.connect(filename)
        c = conn.cursor()

        for f in frames:

            # => Write the frames
            overrides = self.frame_get_overrides(frames[f])
            override_present = len(overrides) > 0
            save = overrides.get("save", True)
            if save:
                try:

                    # Status flag
                    failed = False

                    df = frames[f]["df"]

                    # Drop existing table if necessary...
                    if replace:
                        c.execute(f"DROP TABLE IF EXISTS {f}")

                    # => First create the table schema
                    ddl = pd.io.sql.get_schema(df, f)
                    c.execute(ddl)  # CREATE table

                    # => Dump the dataframe to a csv
                    fd, tmpfile = tempfile.mkstemp(prefix="sqlexport")
                    df.to_csv(tmpfile, header=False, index=False)

                    # => Load it into sqlite
                    cmd = [
                        "/usr/bin/sqlite3",
                        filename,
                        "-cmd",
                        ".separator ,",
                        ".import {} {}".format(tmpfile, f),
                    ]

                    process = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
                    )
                    out, err = process.communicate()
                    err = err.decode("utf-8")
                    out = out.decode("utf-8")

                    # Dump what we have seen
                    if len(err) > 0:
                        failed = True
                        logfunc = partial(
                            logger.error, "Unable to export {}".format(f)
                        )
                    else:
                        logfunc = partial(logger.debug, "Exported {}".format(f))
                    logfunc(
                        extra=self.config.get_extra(
                            {
                                "transform": self.name,
                                "data": "Filename:{}\nOutput\n-----\n{}\n\nErr\n----\n{}".format(
                                    filename, out, err
                                ),
                            }
                        )
                    )

                    # => Update the state for this transform..
                    if not failed:
                        state.update_frame(
                            f,
                            {
                                "df": frames[f]["df"],
                                "frametype": frames[f]["frametype"],
                                "transform": self.name,
                                "history": [{"log": "Exported to SQLite"}],
                                "params": {
                                    "type": "lineage",
                                    "transform": self.name,
                                    "dependencies": [
                                        {
                                            "type": "dataframe",
                                            "nature": "input",
                                            "objects": [f],
                                        },
                                        {
                                            "type": "file",
                                            "nature": "output",
                                            "objects": [filename],
                                        },
                                    ],
                                },
                            },
                        )

                except:
                    logger.exception(
                        "Unable to export dataframe {}".format(f),
                        extra=self.config.get_extra(
                            {
                                "transform": self.name,
                            }
                        ),
                    )

                # Cleanup...
                try:
                    if os.path.exists(tmpfile):
                        os.remove(tmpfile)
                except:
                    pass

                # Dont proceed
                if failed:
                    raise Exception("Error while exporting {}".format(f))

            else:
                logger.warn(
                    "Did not save {} due to overrides".format(f),
                    extra=self.config.get_extra(
                        {
                            "transform": self.name,
                            "data": "Overrides: {}".format(overrides),
                        }
                    ),
                )

        conn.close()
        if not os.path.exists(filename):
            logger.error(
                "SQLite file not created or missing",
                extra=self.config.get_extra(
                    {
                        "transform": self.name,
                        "data": "Filename: {}\nOverride Present: {}".format(
                            filename, override_present
                        ),
                    }
                ),
            )
            raise Exception("SQLite file missing")

        # Now create the state entry
        detail = {
            "df": None,
            "frametype": "db",
            "transform": self.name,
            "params": {
                "filename": filename,
                "action": "output",
                "frametype": "db",
                "descriptions": [
                    "SQLite export of {} frames ({})".format(
                        len(frames), ",".join(frames)
                    )
                ],
                "notes": ["Frames included: {}".format(",".join(frames))],
                "components": [
                    {
                        "filename": relpath,
                        "type": "sqlite",
                        "sha256sum": get_checksum(filename),
                        "filesize": "{0:0.3f} MB".format(
                            get_file_size(filename) / (1024 * 1024)
                        ),
                        "modified_time": str(
                            time.ctime(os.path.getmtime(filename))
                        ),
                        "create_time": str(time.ctime(os.path.getctime(filename))),
                    }
                ],
            },
            "history": [{"transform": self.name, "log": "Write SQLite export"}],
        }
        try:
            state.update_frame(name, detail, create=True)
            state.make_note("Generated database export")
        except:
            logger.exception(
                "Unable to store state",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Error while storing")
validate_results(what, state)

Check to make sure that the execution completed correctly

Source code in enrichsdk/contrib/transforms/sqlexport/__init__.py
def validate_results(self, what, state):
    """
    Check to make sure that the execution completed correctly
    """
    pass

tablesink

TableSink(*args, **kwargs)

Bases: Sink

Transform to dump dataframes in state into files.

Parameters specific to this module include:

* sink: A dictionary of dataframe names and how to output them. It has a number of attributes:

    * type: Output type. Only 'table' value is supported for this
      option right now.
    * filename: Output filename. You can use default parameters such
      as runid

  The name of the dataframe can be a regular expression, allowing you
  to specify a simple rule for an arbitrary number of frames.

Example::

....
"transforms": {
    "enabled": [
        ...
        {
            "transform": "TableSink",
            "args": {
                "sink": {
                    "article": {
                        "frametype": "pandas",
                        "filename": "%(output)s/%(runid)s/article.csv",
                        "params": {
                            "sep": "|"
                        }
                    },
                    ...
                }
            }
            ...
        }
    ]
 }
Source code in enrichsdk/contrib/transforms/tablesink/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "TableSink"

    self.testdata = {
        "conf": {
            "args": {
                "sink": {
                    "cars": {
                        "frametype": "pandas",
                        "filename": "%(output)s/cars_revised.csv",
                        "params": {"sep": ","},
                    }
                }
            }
        },
        "data": {
            "carmodel": {
                "transform": "CarModel",
                "filename": "cars.csv",
                "params": {"sep": ","},
                "state": {
                    "params": [
                        {
                            "type": "args",
                            "transform": "TableSink",
                            "args": {"save": False, "rows": 124},
                        }
                    ]
                },
            }
        },
    }
preload_clean_args(args)

Check to make sure that the arguments are consistent with the specification mentioned above

Source code in enrichsdk/contrib/transforms/tablesink/__init__.py
def preload_clean_args(self, args):
    """
    Check to make sure that the arguments
    are consistent with the specification mentioned
    above
    """

    # Sanity check...
    assert isinstance(args, dict)
    assert len(args) > 0

    # Backward compatibility
    if "sink" not in args:
        args = {"sink": args}

    args = super().preload_clean_args(args)

    assert "sink" in args
    sink = args["sink"]
    assert isinstance(sink, dict)
    assert len(sink) > 0

    for pattern, detail in sink.items():

        if (
            ("type" in detail)
            and ("frametype" not in detail)
            and (detail["type"] == "table")
        ):
            detail["frametype"] = "pandas"

        if ("frametype" not in detail) or (detail["frametype"] != "pandas"):
            logger.error(
                "Invalid configuration. Only pandas table source supported by this sink transform",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        if (
            ("filename" not in detail)
            or (not isinstance(detail["filename"], str))
            or ("params" not in detail)
            or (not isinstance(detail["params"], dict))
        ):
            logger.error(
                "Invalid args. Filename (string) and params (dict) are required",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        detail["root"] = self.config.enrich_data_dir

        tags = detail.get("tags", [])
        if isinstance(tags, str):
            tags = [tags]
        detail["tags"] = tags

        sortcols = detail.get("sort", [])
        if isinstance(sortcols, str):
            sortcols = [sortcols]
        detail["sort"] = sortcols

    return args
process(state)

Execute the tablesink specification

Source code in enrichsdk/contrib/transforms/tablesink/__init__.py
def process(self, state):
    """
    Execute the tablesink specification
    """

    # => What exists in the state?
    available_frames = state.get_frame_list()

    # => First construct input for the pandasframe
    extra = {}
    write_input = {}
    args_input = {}
    framecls = self.config.get_dataframe("pandas")
    skipped = []

    sink = self.args["sink"]
    for pattern in sink:
        # The pattern could be precise dataframe name or could be
        # regular expression.
        regex = re.compile("^{}$".format(pattern))
        frames = [
            m.group(0) for f in available_frames for m in [regex.search(f)] if m
        ]

        for f in frames:
            # For each dataframe that is in the system

            detail = state.get_frame(f)

            # => Are there any extra instructions?
            overrides = self.frame_get_overrides(detail)

            # => Materialize the path...
            filename = sink[pattern]["filename"]
            filename = self.config.get_file(
                filename, create_dir=True, extra={"frame_name": f}
            )
            # Collect all column information
            extra[f] = {
                "columns": self.collapse_columns(detail),
                "notes": self.collapse_notes(detail),
                "descriptions": self.collapse_descriptions(detail),
                "overrides": overrides,
            }

            # Which dataframe
            df = detail["df"]

            # Get the frame type
            frametype = detail["frametype"]

            # Order the dataframe if it is needed
            sortcols = sink[pattern]["sort"]
            if len(sortcols) > 0:
                df = framecls.sort_values(df, sortcols, ascending=False)
            params = sink[pattern].get("params", {})

            # Should I be writing this csv?
            save = params.get("save", True)
            save = overrides.get("save", save)

            if not save:
                skipped.append(f)

            write_input[f] = {
                "save": save,
                "frametype": frametype,
                "pattern": pattern,
                "df": df,
                "filename": filename,
                "params": params,
            }

            args_input[f] = copy.copy(sink[pattern])
            args_input[f]["filename"] = filename

    if len(skipped) > 0:
        logger.warning(
            "Not saving {} tables".format(len(skipped)),
            extra={"transform": self.name, "data": skipped},
        )

    # => Write output details
    framecls.write(args_input, write_input)

    for name in write_input:

        detail = write_input[name]

        # => Insert columns and tags
        pattern = detail["pattern"]

        #
        detail["params"]["tags"] = sink[pattern]["tags"]

        # Incorporate columns, notes and description
        additional_params = extra[name]
        overrides = additional_params.pop("overrides", {})

        detail["params"].update(additional_params)

        # Insert any overrides provided in the state
        if "rows" in overrides:
            detail["params"]["components"][0]["rows"] = overrides["rows"]

        detail["params"] = [
            detail["params"],
            {
                "type": "lineage",
                "transform": self.name,
                "dependencies": [
                    {"type": "dataframe", "nature": "input", "objects": [name]},
                    {
                        "type": "file",
                        "nature": "output",
                        "objects": [detail["filename"]],
                    },
                ],
            },
        ]

        # Insert additional detail
        detail["transform"] = self.name
        detail["history"] = [{"transform": self.name, "log": "Wrote output"}]

        state.update_frame(name, detail)

    logger.debug(
        "Finished writing data",
        extra=self.config.get_extra({"transform": self.name}),
    )
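
Writing of an individual frame can be suppressed through overrides carried in that frame's state params (as in the test data above); a sketch of the override entry that process() picks up via frame_get_overrides:

# Placed in the frame's state params by an upstream transform
{
    "type": "args",
    "transform": "TableSink",
    "args": {"save": False, "rows": 124},
}
# With "save": False the frame is added to the skipped list instead of being
# written; "rows" overrides the row count recorded in the output components.
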
validate_args(what, state)

Extra validation of the arguments

Source code in enrichsdk/contrib/transforms/tablesink/__init__.py
def validate_args(self, what, state):
    """
    Extra validation of the arguments
    """
    sink = self.args["sink"]

    assert isinstance(sink, dict)
    for pattern, detail in sink.items():
        assert ("frametype" in detail) and (detail["frametype"] == "pandas")
        assert "filename" in detail
        assert "params" in detail

tablesource

TableSource(*args, **kwargs)

Bases: Source

Load csv/other files into pandas dataframes.

Parameters specific to this module include:

* source: A dictionary of dataframe names and how to
  load them. It has a number of attributes:

    * type: Input type. Only 'table' value is
      supported for this option.
    * filename: Input filename. You can use default
      parameters such as runid
    * params: Params are arguments to [pandas read_csv](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html)

Example::

....
"transforms": {
    "enabled": [
        {
            "transform": "TableSink",
            "args": {
                "source": {
                    "article": {
                        "type": "file",
                        "filename": "%(data)s/ArticleData.csv",
                        "params": {
                            "delimiter": "|",
                            "dtype": {
                                "sku": "category",
                                "mc_code": "int64",
                                "sub_class": "category",
                                "priority": "float64"
                                ...
                            }
                        }
                    }
                }
              ...
            }
        }
    ...
   ]
 }
Source code in enrichsdk/contrib/transforms/tablesource/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "TableSource"
clean(state)

This is meant for subclasses to do some additional processing.

Source code in enrichsdk/contrib/transforms/tablesource/__init__.py
def clean(self, state):
    """
    This is meant for subclasses to do some additional processing.
    """
    pass
preload_clean_args(args)

Clean when the spec is loaded...

Source code in enrichsdk/contrib/transforms/tablesource/__init__.py
def preload_clean_args(self, args):
    """
    Clean when the spec is loaded...
    """

    # Backward compatibility
    if "source" not in args:
        args = {"source": args}

    args = super().preload_clean_args(args)

    # Sanity check...
    assert isinstance(args, dict)
    assert "source" in args
    assert isinstance(args["source"], dict)

    for name, detail in args["source"].items():

        # Insert the frame into the args for backward
        # compatibility.
        if (
            ("type" in detail)
            and ("frametype" not in detail)
            and (detail["type"] == "table")
        ):
            detail["frametype"] = "pandas"

        if ("frametype" not in detail) or (detail["frametype"] != "pandas"):
            logger.error(
                "Invalid configuration. Only pandas table source supported by this source transform",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        if (
            ("filename" not in detail)
            or (not isinstance(detail["filename"], str))
            or ("params" not in detail)
            or (not isinstance(detail["params"], dict))
        ):
            logger.error(
                "Invalid args. Filename (string) and params (dict) are required",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Invalid configuration")

        mapping = {
            "str": str,
            "float": np.float64,
            "float64": np.float64,
            "np.float64": np.float64,
            "np.int64": np.int64,
            "int": np.int64,
            "int64": np.int64,
            "datetime": datetime,
            "category": "category",
        }

        # => Materialize the path...
        detail["filename"] = self.config.get_file(detail["filename"])
        detail["root"] = self.config.enrich_data_dir
        params = detail["params"]
        if "dtype" in params:
            for attr in params["dtype"]:
                if params["dtype"][attr] in mapping:
                    params["dtype"][attr] = mapping[params["dtype"][attr]]
                else:
                    params["dtype"][attr] = eval(params["dtype"][attr])

    return args
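
The dtype strings supplied in params are resolved to concrete types before loading; a sketch of the effect of the mapping above on a hypothetical source entry:

params = {"dtype": {"sku": "category", "mc_code": "int64", "priority": "float64"}}

# After preload_clean_args the values are concrete types:
#   {"sku": "category", "mc_code": numpy.int64, "priority": numpy.float64}
# Strings not present in the mapping are passed through eval(), so only
# trusted configuration should be used here.
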
process(state)

Load file...

Source code in enrichsdk/contrib/transforms/tablesource/__init__.py
def process(self, state):
    """
    Load file...
    """
    # Load all the dataframes. This will use the full enrich
    # deployment's beefed up read function.
    framecls = self.config.get_dataframe("pandas")
    source = self.args["source"]

    dfstates = framecls.read(source, {})
    for dfname, dfstate in dfstates.items():

        # => Insert column description
        columns = dfstate["params"]["columns"]
        for c in columns:
            columns[c]["description"] = self.get_column_description(dfname, c)

        params = dfstate["params"]
        if "filename" in params:
            filename = params["filename"]
        elif "filename" in source.get(dfname, {}):
            filename = self.args[dfname]["filename"]
        else:
            filename = "Unknown"

        detail = {
            "df": dfstate["df"],
            "transform": self.name,
            "frametype": "pandas",
            "params": [
                params,
                {
                    "type": "lineage",
                    "transform": self.name,
                    "dependencies": [
                        {"type": "file", "nature": "input", "objects": [filename]}
                    ],
                },
            ],
            "history": [
                {
                    "transform": self.name,
                    "log": "Read data using {}".format(framecls.__class__.__name__),
                }
            ],
        }
        try:
            state.update_frame(dfname, detail, create=True)
        except:
            logger.exception(
                "Unable to store state",
                extra=self.config.get_extra({"transform": self.name}),
            )
            raise Exception("Error while loading")

    # Clean the uploaded data...
    self.clean(state)

enrichsdk.contrib.lib.transforms

AnomaliesBase(*args, **kwargs)

Bases: Compute

Compute anomalies given a dataframe with columns

Features of transform baseclass include:

* Flexible configuration
* High-level specification of column combinations and detection strategy
Source code in enrichsdk/contrib/lib/transforms/anomalies/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "AnomaliesBase"
    self.description = "Compute anomalies in column(s) of a dataframe"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }

get_dataset_s3(spec, paths)

Gets all files from paths and puts them together into a single dataframe. If self.args['cache']==True, then this consolidated dataframe is cached / read from cache as applicable.

Source code in enrichsdk/contrib/lib/transforms/anomalies/__init__.py
def get_dataset_s3(self, spec, paths):
    '''
    Gets all files from paths and puts them together
    into a single dataframe. If self.args['cache']==True,
    then this consolidated dataframe is cached / read from cache
    as applicable.
    '''
    msg = ""

    run_date    = self.args['run_date']
    config      = spec['config']
    dataset     = config['dataset']

    cache = self.args.get("cache", False)
    cachename = f"{dataset}-{run_date}"
    cachefile = f"cache/{self.name}-rawdata-cache-" + cachename + ".csv"

    # read from cache if available
    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            msg = f"Location: {cachefile}" + "\n"
            df = pd.read_csv(cachefile)
            logger.debug(f"Read cached {dataset}", extra={"transform": self.name})
            return df

    # read from S3
    dfs = []
    for path in paths:
        _df = self.read_s3_data(path)
        if _df is None:
            msg += f"Path not found, skipping: {path}" + "\n"
            continue
        msg += f"Read from path: {path}" + "\n"
        dfs.append(_df)
    df = pd.concat(dfs)

    logger.debug(f"Read fresh {dataset}", extra={"transform": self.name})

    # Cache it for future use
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if (len(paths) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": [paths],
                },
            ],
        }

    if not self.state.has_frame(spec['name']):
        self.update_frame(spec, f"Dataset: {dataset}", df, lineage)

    return df
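
With caching enabled, the consolidated dataframe is written to and re-read from a deterministic local path derived from the dataset name and run date; a sketch of the naming, assuming a transform named AnomaliesBase and a dataset called sales:

# self.args = {"run_date": "2022-01-15", "cache": True}; dataset = "sales"
cachename = "sales-2022-01-15"
cachefile = "cache/AnomaliesBase-rawdata-cache-" + cachename + ".csv"
# -> "cache/AnomaliesBase-rawdata-cache-sales-2022-01-15.csv"
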

get_handlers(spec)

Define various callbacks that take a dataframe, spec and compute.

Source code in enrichsdk/contrib/lib/transforms/anomalies/__init__.py
def get_handlers(self, spec):
    """
    Define various callbacks that take a dataframe, spec
    and compute.
    """
    return {}

get_profile()

Read the profile json from API

Source code in enrichsdk/contrib/lib/transforms/anomalies/__init__.py
def get_profile(self):
    """
    Read the profile json from API
    """

    if (not hasattr(self, "args")):
        raise Exception(
            "'args' transform attribute should be defined to use default get_profile method"
        )
    for p in ['apicred']:
        if self.args.get(p) == None:
            raise Exception(
                f"'{p}' attribute in args should be defined to use default get_profile method"
                )

    # call the API to get the anomaly specs
    anomalyspecs, is_valid, msg = load_profile_api(self.args)
    logger.debug(
        f"Loading profile from API",
        extra={"transform": self.name, "data": msg},
    )
    if is_valid == False:
        raise Exception(f"Error loading profile")

    specs = anomalyspecs["specs"]
    logger.debug(
        f"Found {len(specs)} specs",
        extra={"transform": self.name, "data": json.dumps(anomalyspecs, indent=4)},
    )

    return anomalyspecs

preprocess_spec(spec)

to be overloaded in the derived class

Source code in enrichsdk/contrib/lib/transforms/anomalies/__init__.py
def preprocess_spec(self, spec):
    '''
    to be overloaded in the derived class
    '''
    return spec

process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/anomalies/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    # Will be used in other places..
    self.state = state

    # Get the anomaly profile
    is_valid, profile, msg = profilespec.get_profile(self, "policyapp.outliersv2")
    if is_valid:
        logger.debug(
            f"Loaded profilespec",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")

    specs = profile.get("specs", None)
    if specs is None:
        raise Exception("Could not find 'specs' in profile")

    # Now go through each spec and generate anomaly reports
    for spec in specs:

        ## first, some checks on the spec
        do_process_spec = True
        name = spec.get('name', 'NO_SPEC_NAME')

        enabled = spec.get("active", True)
        if not enabled:
            logger.debug(
                f"Spec not enabled, skipping.",
                extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
            )
            do_process_spec = False
            continue

        for f in ["name", "config"]:
            if f not in spec:
                logger.exception(
                    f"Spec has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break
        if do_process_spec == False:
            continue

        config = spec['config']

        for f in ["source_id"]:
            if f not in config:
                logger.exception(
                    f"Spec config has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break

        if not do_process_spec:
            continue

        ## pre-process the spec
        try:
            spec = self.preprocess_spec(spec)
            logger.debug(f"Preproccessed spec: {spec['name']}",
                         extra={
                             'transform': self.name,
                             'data': json.dumps(spec, indent=4)
                         })

            ## we can now proceed with processing the spec
            # first, load the source data
            data = self.load_dataset(spec)

            ## process the spec to detect outliers
            data = self.process_spec(spec, data)

            if ((not isinstance(data, dict)) or
                (len(data) == 0)):
                continue

            # write the detected outliers
            self.store_result(spec, data)
        except:
            logger.exception(f"Failed to process {name}",
                             extra={
                                 'transform': self.name
                             })

    # Done
    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state
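
A spec that survives the checks above needs at least a name and a config containing source_id, and is skipped when active is false; a hypothetical minimal example:

spec = {
    "name": "daily_sales_outliers",   # hypothetical spec name
    "active": True,                   # optional; defaults to True
    "config": {
        "source_id": "sales",         # hypothetical dataset reference
        # strategy-specific settings (metrics, groups, ...) go here
    },
}
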

process_spec_default(data, spec)

Handle one specification at a time..

Source code in enrichsdk/contrib/lib/transforms/anomalies/__init__.py
def process_spec_default(self, data, spec):
    """
    Handle one specification at a time..
    """

    partialsamplerate = 0.05
    samplerate_lut = {
        "all": 1.0,
        "partial": partialsamplerate,
        "none": 0.0
    }
    tolerances = {
        "low": 1,
        "medium": 2,
        "high": 3,
    }

    def anomaly_note(row, threshold):
        distance = row[f"__anomaly_distance__"]
        if distance > threshold:
            return f"{(round(distance/threshold,2))}x outside expected sample deviation"
        return f"within expected sample deviation"


    msg = ""
    msg += f"Using default centroid distance anomaly detector" + "\n"

    config = spec["config"]
    msg += f"Config: {json.dumps(config, indent=4)}" + "\n"

    # Get hold of the data first...
    name = spec["name"]
    orig_df = data
    total_samples = len(orig_df)

    metrics     = config.get("metrics", orig_df.columns)
    groups      = config.get('groups', [])
    outputs     = config.get("outputs", orig_df.columns)
    dimensions  = config.get("dimensions", orig_df.columns)
    columns     = list(set(metrics + outputs + dimensions))

    msg += f"Combined set of columns: {columns}" + "\n"
    msg += f"{note(orig_df, 'Original DF')}" + "\n"

    #########
    # default anomaly detection
    #########
    # get tolerance thresold
    tolerance = config.get("threshold", config.get("thresold", "medium"))
    scalefactor = tolerances.get(tolerance, 2)

    # get the sample strategy for the normal data
    normal_samples = config.get("normal_samples", "partial")
    samplerate = samplerate_lut[normal_samples]

    msg += f"(tolerance, scalefactor): ({tolerance}, {scalefactor})" + "\n"

    logger.debug(f"Setting up for spec: {spec['name']}",
                     extra={
                         'transform': self.name,
                         'data': msg
                     })

    anomaly_stats = {}
    plotdata = {}
    dfs = []

    #########
    # we first do the leaf level, per metric to check for anomalies
    #########
    msg = f"Processing metrics: {metrics}" + "\n\n"

    for metric in metrics:

        # make a copy of the df, we'll keep adding anomaly metrics to it
        df = orig_df[columns].copy()

        if not is_numeric_dtype(df[metric]):
            msg += f"{metric} Metric not numeric. Skipping\n"
            continue

        # compute the anomalies for this metric
        points      = df[metric].to_numpy()     # all data as an MxN matrix
        centroid    = df[metric].mean()          # the computed centroid of the dataset
        distances   = abs(points - centroid)    # distances of each point to centroid
        stddev      = np.nanstd(points)      # std dev of distances
        threshold   = stddev * scalefactor
        anomalies   = np.where(distances.flatten()>threshold, 'anomaly', 'normal')    # flag where anomalies occur

        # add columns indicating anomaly label
        id = f"metric-{metric}"
        df['id'] = id
        df['level'] = 'metric'
        df['name'] = metric
        df['__is_anomaly__'] = pd.Series(anomalies)

        # add columns indicating reason for anomaly
        df[f"__anomaly_distance__"] = pd.Series(distances.flatten())
        df[f"__anomaly_note__"] = df.apply(lambda x: anomaly_note(x, threshold), axis=1)

        df_a = df[df['__is_anomaly__']=='anomaly']
        n_anomalies = len(df_a)
        perc_anomalies = round(n_anomalies/total_samples*100, 2)

        df_n = df[df['__is_anomaly__']=='normal'].sample(frac=samplerate)
        df_n = df_n[0:min(3*n_anomalies,len(df_n))] # min 3x n_anomalies or configured sample of normal samples
        n_nsamples = len(df_n)

        # for this metric, we now have all the detected anomalies and the sampled normal data
        sampled_df = pd.concat([df_a, df_n])

        msg += f"--------------------------" + "\n"
        msg += f"Metric: {metric}" + "\n"
        msg += f"Computed stddev: {stddev}" + "\n"
        msg += f"Threshold: {threshold}" + "\n"
        msg += f"Anomalies: {n_anomalies}/{total_samples}={perc_anomalies}%" + "\n"
        msg += f"--------------------------" + "\n\n"

        anomaly_stats[id] = {
            "level": 'metric',
            "name": metric,
            "dimensions": dimensions,
            "n_anomalies": n_anomalies,
            "perc_anomalies": perc_anomalies,
            "n_normalsamples": n_nsamples,
            "n_plotsamples": len(df),
        }
        plotdata[id] = df

        dfs.append(sampled_df)

    logger.debug(f"Processed metrics level: {spec['name']}",
                     extra={
                         'transform': self.name,
                         'data': msg
                     })


    # #########
    # # then we do the group level, hierarchial
    # #########
    msg = f"Processing groups: {groups}" + "\n\n"

    for group in groups:
        group_name      = group.get('group')
        g_dimensions    = group.get('dimensions', dimensions)
        g_metrics       = group.get('metrics')

        # we don't have what we need, skip
        if group_name is None or g_metrics is None:
            continue

        if not all([is_numeric_dtype(df[metric]) for metric in g_metrics]):
            msg += f"{group_name} One or more metrics are not numeric\n"
            continue

        # make a copy of the df, we'll keep adding anomaly metrics to it
        df = orig_df[columns].copy()

        points      = df[g_metrics].to_numpy()    # all data as an MxN matrix
        centroid    = df[g_metrics].mean().values # the computed centroid of the dataset
        distances   = distance.cdist(points, np.array([centroid]), 'euclidean') # distances of each point to centroid
        distances   = np.reshape(distances, len(distances))
        stddev      = np.nanstd(points)         # std dev of distances
        threshold   = stddev * scalefactor
        anomalies   = np.where(distances.flatten()>threshold, 'anomaly', 'normal')    # flag where anomalies occur

        # add columns indicating anomaly label
        id = f"group-{group_name}"
        df['id'] = id
        df['level'] = 'group'
        df['name'] = group_name
        df['__is_anomaly__'] = pd.Series(anomalies)

        # add columns indicating reason for anomaly
        df[f"__anomaly_distance__"] = pd.Series(distances.flatten())
        df[f"__anomaly_note__"] = df.apply(lambda x: anomaly_note(x, threshold), axis=1)

        df_a = df[df['__is_anomaly__']=='anomaly']
        n_anomalies = len(df_a)
        perc_anomalies = round(n_anomalies/total_samples*100, 2)

        df_n = df[df['__is_anomaly__']=='normal'].sample(frac=samplerate)
        df_n = df_n[0:min(3*n_anomalies,len(df_n))] # cap normal samples at 3x the anomaly count, or fewer if the sample is smaller
        n_nsamples = len(df_n)

        # for this group, we now have all the detected anomalies and the sampled normal data
        sampled_df = pd.concat([df_a, df_n])

        msg += f"--------------------------" + "\n"
        msg += f"Group: {group_name}" + "\n"
        msg += f"Computed stddev: {stddev}" + "\n"
        msg += f"Threshold: {threshold}" + "\n"
        msg += f"Anomalies: {n_anomalies}/{total_samples}={perc_anomalies}%" + "\n"
        msg += f"--------------------------" + "\n"

        anomaly_stats[id] = {
            "level": 'group',
            "name": group_name,
            "metrics": g_metrics,
            "dimensions": g_dimensions,
            "threshold": threshold,
            "n_anomalies": n_anomalies,
            "perc_anomalies": perc_anomalies,
            "n_normalsamples": n_nsamples,
            "n_plotsamples": len(df),
        }
        plotdata[id] = df

        dfs.append(sampled_df)

    logger.debug(f"Processed groups level: {spec['name']}",
                     extra={
                         'transform': self.name,
                         'data': msg
                     })

    if len(dfs) == 0:
        logger.debug(f"{name}: No outputs computed",
                     extra={
                         'transform': self.name,
                         'data': msg
                     })
        return None

    #########
    # construct the DF for output
    #########
    # concat for all metrics+groups
    df = pd.concat(dfs)
    # reorder columns
    first_cols = ['id', 'level', 'name']
    cols = first_cols + [c for c in df.columns if c not in first_cols]
    df = df[cols]

    msg = f"Final columns: {df.columns}" + "\n"

    window, start_date, end_date = self.get_window_dates(config, self.args)

    # compute stats of interest
    stats = {
        "timestamp": f"{datetime.now().isoformat()}",
        "policy": config,
        "data_start_date": f"{start_date}",
        "data_end_date": f"{end_date}",
        "strategy": "centroid",
        "tolerance": tolerance,
        "scalefactor": scalefactor,
        "normalsamples": normal_samples,
        "samplerate": samplerate,
        "n_rows": total_samples,
        "anomaly_stats": anomaly_stats,
    }

    msg += f"Stats: {json.dumps(stats, indent=4)}" + "\n"

    msg += f"{note(df, 'Anomaly DF')}" + "\n"

    logger.debug(f"Completed spec: {spec['name']}",
                     extra={
                         'transform': self.name,
                         'data': msg
                     })

    return {name: df, "stats": stats, "plotdata": plotdata}
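
Both passes above apply the same centroid-distance test: compute each row's Euclidean distance to the centroid of the metric columns and flag rows whose distance exceeds a multiple of the spread of the data. A minimal standalone sketch of that test (the column names and the scale factor of 2.0 are illustrative assumptions, not part of the transform)::

    import numpy as np
    import pandas as pd
    from scipy.spatial import distance

    def flag_anomalies(df, metrics, scalefactor=2.0):
        # distance of every row to the centroid of the selected metric columns
        points    = df[metrics].to_numpy()
        centroid  = df[metrics].mean().values
        distances = distance.cdist(points, np.array([centroid]), 'euclidean').flatten()

        # threshold scales with the spread of the data points
        threshold = np.nanstd(points) * scalefactor

        out = df.copy()
        out['__anomaly_distance__'] = distances
        out['__is_anomaly__'] = np.where(distances > threshold, 'anomaly', 'normal')
        return out

    # usage with illustrative data
    df = pd.DataFrame({"spend": [1, 2, 2, 3, 50], "clicks": [10, 12, 11, 9, 200]})
    flagged = flag_anomalies(df, ["spend", "clicks"])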

ChangePointDetectorBase(*args, **kwargs)

Bases: Compute

Take a timeseries signal and identify changepoints in the signal

Features of transform baseclass include:

* Flexible configuration
* Highlevel specification of change point detection:
    * specified data source or custom method to generate one
    * generic change point detection method or custom defined ones

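The process method below expects each spec to carry a name and a config with source and detector entries, and the source must name an indicator dataset plus one or more observation datasets. A minimal sketch of such a spec (dataset names and the detector value are hypothetical; each dataset value is whatever load_dataset resolves)::

    spec = {
        "name": "rates-changepoints",                 # hypothetical spec name
        "active": True,
        "config": {
            "source": {
                "indicator": "interest-rate-series",  # timeseries driving changepoint detection
                "observations": {
                    "sales": "daily-sales-series",    # series whose regimes are computed
                },
            },
            "detector": "default",                    # assumed detector selection
        },
    }
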
Source code in enrichsdk/contrib/lib/transforms/changepoints/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "ChangePointDetectorBase"
    self.description = "Change point(s) detection for a timeseries signal given a spec"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }

    self.cautions = {
        "low": {"color": "green", "desc": "MINIMAL", "rec": "In a BUSINESS-AS-USUAL regime now."},
        "medium": {"color": "gold", "desc": "LOW to MODERATE", "rec": "Expect LOW to MODERATE swings in this regime."},
        "high": {"color": "red", "desc": "HIGH to EXTREME", "rec": "Stay alert for HIGH to EXTREME swings in this regime."},
    }

    self.epoch = time.time()    #for output path

get_dataset_s3(spec, source, paths, start_date, end_date)

Gets all files from paths and puts them together into a single dataframe. If self.args['cache']==True, then this consolidated dataframe is cached / read from cache as applicable.

Source code in enrichsdk/contrib/lib/transforms/changepoints/__init__.py
def get_dataset_s3(self, spec, source, paths, start_date, end_date):
    '''
    Gets all files from paths and puts them together
    into a single dataframe. If self.args['cache']==True,
    then this consolidated dataframe is cached / read from cache
    as applicable.
    '''
    msg = ""

    run_date    = self.args['run_date']
    config      = spec['config']
    dataset     = source['dataset']
    params      = source.get('params', {})

    cache = self.args.get("cache", False)
    cachename = f"{dataset}-{start_date}-to-{end_date}"
    cachefile = f"cache/{self.name}-rawdata-cache-" + cachename + ".csv"

    # read from cache if available
    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            msg = f"Location: {cachefile}" + "\n"
            df = pd.read_csv(cachefile, **params)
            logger.debug(f"Read cached {dataset}", extra={"transform": self.name, "data": msg})
            return df

    # read from S3
    dfs = []
    for path in paths:
        _df = self.read_s3_data(path, params)
        if _df is None:
            msg += f"Path error, skipping: {path}" + "\n"
            continue
        msg += f"Read from path: {path}" + "\n"
        dfs.append(_df)
    df = pd.concat(dfs)

    logger.debug(f"Read fresh {dataset}", extra={"transform": self.name})

    # Cache it for future use
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if (len(paths) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": paths,
                },
            ],
        }

    if not self.state.has_frame(spec['name']):
        self.update_frame(spec, f"Dataset: {dataset}", df, lineage)

    return df

get_handlers(spec)

Define various callbacks that take a dataframe, spec and compute.

Source code in enrichsdk/contrib/lib/transforms/changepoints/__init__.py
def get_handlers(self, spec):
    """
    Define various callbacks that take a dataframe, spec
    and compute.
    """
    return {}
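
The default implementation returns no callbacks; a subclass can hand back named callables for its own processing steps to look up. How the handlers are consumed is up to the subclass, so the sketch below is only illustrative and assumes ChangePointDetectorBase is importable from enrichsdk.contrib.lib.transforms::

    class MyChangePointDetector(ChangePointDetectorBase):
        def get_handlers(self, spec):
            # map handler names (referenced from the spec) to bound methods
            return {
                "clean_indicator": self.clean_indicator,   # hypothetical callback
            }

        def clean_indicator(self, df):
            # example callback: drop incomplete rows before detection
            return df.dropna()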

process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/changepoints/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    # Will be used in other places..
    self.state = state

    # Get the profile spec
    is_valid, profile, msg = profilespec.get_profile(self, "policyapp.changepoint")
    if is_valid:
        logger.debug(
            f"Loaded profilespec",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")

    specs = profile.get("specs", None)
    if specs is None:
        raise Exception("Could not find 'specs' in profile")

    # Now go through each spec and process it
    for spec in specs:

        ## first, some checks on the spec
        do_process_spec = True
        name = spec.get('name', 'NO_SPEC_NAME')

        enabled = spec.get("active", True)
        if not enabled:
            logger.debug(
                f"Spec not enabled, skipping.",
                extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
            )
            do_process_spec = False
            continue

        for f in ["name", "config"]:
            if f not in spec:
                logger.exception(
                    f"Spec has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break
        if do_process_spec == False:
            continue

        config = spec['config']

        for f in ["source", "detector"]:
            if f not in config:
                logger.exception(
                    f"Spec config has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break
        if do_process_spec == False:
            continue

        ## we can now proceed with processing the spec
        source = config['source']
        for f in ["indicator", "observations"]:
            if f not in source:
                logger.exception(
                    f"Spec config has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break
        # exceptions have occurred, continue to the next spec
        if do_process_spec == False:
            continue

        # get time_window for indicator and observation data set
        datewindow = self.get_datewindow(source, spec)
        if datewindow is None :
            do_process_spec = False
            continue

        # first, load the indicator dataset
        data = {}
        data['indicator'] = self.load_dataset(spec, 'indicator', source['indicator'], datewindow)
        if data['indicator'] is None:
            do_process_spec = False
            continue

        # then, load all the observations datasets
        data['observations'] = {}
        for ob_dataset, dataset in source['observations'].items():
            data['observations'][ob_dataset] = self.load_dataset(spec, ob_dataset, dataset, datewindow)

        # then, process it
        result = self.process_spec(spec, data)
        if result is None:
            continue

        ## store the expectation validation result
        self.store_result(spec, result)


    # Done
    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state

process_spec_default(spec, data)

Run the default change point detection

Source code in enrichsdk/contrib/lib/transforms/changepoints/__init__.py
def process_spec_default(self, spec, data):
    """
    Run the default change point detection
    """

    ## note: we need to save viz objects immediately after creation
    ## or they will be overwritten when the next viz object is created

    msg = ""
    name = spec['name']

    indicator_ts = data['indicator']
    cpd = self.run_changepoint_detector(indicator_ts, name)
    changepoints = cpd['changepoints']

    # save changepoint visualization
    viz = cpd['viz']
    filename = f"{name}-changepoints.png"
    l_msg = self.store_viz(spec, filename, viz)
    msg += l_msg
    cpd.pop('viz', None)

    observations = {}
    for observation, observation_ts in data['observations'].items():
        regimes = self.compute_regimes(indicator_ts, observation_ts, changepoints)
        viz = self.visualize_regimes(observation_ts, regimes)
        observations[observation] = {
            "regimes": regimes,
            "viz": viz
        }
        # save regimes visualization
        filename = f"{name}-regime-{observation}.png"
        l_msg = self.store_viz(spec, filename, viz)
        msg += l_msg
        observations[observation].pop('viz', None)

    logger.debug(
        f"Saved visualizations",
        extra={"transform": self.name, "data": msg}
    )

    result = {
        "changepoints": cpd,
        "observations": observations
    }

    return result

ClassifierBase(*args, **kwargs)

Bases: Compute

Takes a training dataset and one or more eval datasets. Builds a classification model using the training dataset, then applies the model to the eval dataset(s) and generates predictions.

Features of transform baseclass include:

* Flexible configuration
* Highlevel specification of steps in ML classification flow:
    * specify multiple datasets (one for training, one or more for evaluation)
    * specify optional dataset prep methods
    * specify training model details with support for imbalanced datasets
    * specify evaluation strategy on one or more datasets

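Each spec wires together data sources, a training block, and one or more prediction blocks; the fields below are the ones checked in process, load_sources, train_models, and make_predictions further down (filenames are resolved under self.args['root']). A minimal sketch with hypothetical names, paths, and parameters::

    spec = {
        "name": "churn-classifier",                       # hypothetical
        "sources": [
            {"nature": "disk", "name": "train", "filename": "train.csv", "stage": "train"},
            {"nature": "disk", "name": "recent", "filename": "recent.csv", "stage": "predict"},
        ],
        "prep": {"method": "prep_onehot"},                # optional; see prep_data below
        "train": {
            "metric": "auc",                              # metric used to pick the best model
            "folds": 5,                                   # StratifiedKFold splits
            "models": [
                {
                    "name": "knn-smote",
                    "source": "train",
                    "target": "churned",                  # label column
                    "ignore": ["customer_id"],            # columns excluded from the features
                    "resample": "smote",
                    "model": {"algorithm": "knn", "params": {"n_neighbors": 5}},
                },
            ],
        },
        "predict": [
            {"name": "score-recent", "sources": ["recent"], "target": "churned", "model": "best"},
        ],
    }
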
Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "ClassifierBase"
    self.description = "Classification of data using a trained ML model"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }

    self.epoch = time.time()    #for output path

do_training(profilespec, modelspec, X, y, model, cv, metric)

Train a model given a dataset and a pipeline

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def do_training(self, profilespec, modelspec, X, y, model, cv, metric):
    """
    Train a model given a dataset and a pipeline
    """

    msg = ""

    name = modelspec['name']

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    fig, ax = plt.subplots(figsize=(9, 9))
    for fold, (train, test) in enumerate(cv.split(X, y)):
        model.fit(X[train], y[train])
        viz = RocCurveDisplay.from_estimator(
            model,
            X[test],
            y[test],
            name=f"ROC fold {fold}",
            alpha=0.3,
            lw=1,
            ax=ax,
        )
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)
    ax.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")

    # get the final model fit
    model.fit(X, y)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(
        mean_fpr,
        mean_tpr,
        color="b",
        label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
        lw=2,
        alpha=0.8,
    )

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(
        mean_fpr,
        tprs_lower,
        tprs_upper,
        color="grey",
        alpha=0.2,
        label=r"$\pm$ 1 std. dev.",
    )

    ax.set(
        xlim=[-0.05, 1.05],
        ylim=[-0.05, 1.05],
        xlabel="False Positive Rate",
        ylabel="True Positive Rate",
        title=f"[{name}] Mean ROC curve with variability')",
    )
    ax.axis("square")
    ax.legend(loc="lower right", fontsize=16)
    plt.tight_layout()

    # save training visualization
    filename = f"train-{name}-roc.png"
    l_msg = self.store_viz(profilespec, filename, plt)

    msg += l_msg

    # return the appropriate metric
    if metric == "auc":
        metric_val = mean_auc
    elif metric == "tpr":
        metric_val = mean_tpr
    elif metric == "fpr":
        metric_val = mean_fpr
    else:
        metric_val = mean_auc

    classifier = {
        "model": model,
        "metric": metric_val
    }

    return classifier, msg

get_classifier_pipeline(model)

Construct the classifier pipeline:

1. resampling
2. classifier model

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def get_classifier_pipeline(self, model):
    """
    Construct the classifier pipeline
        1. resampling
        2. classifier model
    """

    # do we need to resample the data
    # supports upsampling the minority class for now
    resample = model.get("resample")
    if resample == 'random':
        resampler = RandomOverSampler()
    elif resample == 'smote':
        resampler = SMOTE()
    else:
        # no resampling by default
        resampler = None

    # then get the classifier algorithm
    algorithm = model.get("model", {}).get("algorithm")
    params = model.get("model", {}).get("params", {})
    if algorithm == "knn":
        classifier = KNeighborsClassifier(**params)
    elif algorithm == "svm":
        classifier = svm.SVC(**params)
    else:
        # use the kNN algorithm by default
        classifier = KNeighborsClassifier(n_neighbors=3)

    # construct the pipeline
    if resampler == None:
        pipeline = classifier
    else:
        pipeline = make_pipeline(resampler, classifier)

    return pipeline
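
The pipeline is driven by the model entry in the training block: resample selects an imbalanced-learn over-sampler ('random' or 'smote'), and model.algorithm selects the scikit-learn classifier ('knn' or 'svm', with kNN as the fallback). For example, a model entry like the following (parameters are illustrative) yields make_pipeline(SMOTE(), svm.SVC(C=1.0, kernel="rbf"))::

    model = {
        "resample": "smote",        # 'random' -> RandomOverSampler; omit for no resampling
        "model": {
            "algorithm": "svm",
            "params": {"C": 1.0, "kernel": "rbf"},
        },
    }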

get_dataset_s3(spec, source, paths, start_date, end_date)

Gets all files from paths and puts them together into a single dataframe. If self.args['cache']==True, then this consolidated dataframe is cached / read from cache as applicable.

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def get_dataset_s3(self, spec, source, paths, start_date, end_date):
    '''
    Gets all files from paths and puts them together
    into a single dataframe. If self.args['cache']==True,
    then this consolidated dataframe is cached / read from cache
    as applicable.
    '''
    msg = ""

    run_date    = self.args['run_date']
    config      = spec['config']
    dataset     = source['dataset']
    params      = source.get('params', {})

    cache = self.args.get("cache", False)
    cachename = f"{dataset}-{start_date}-to-{end_date}"
    cachefile = f"cache/{self.name}-rawdata-cache-" + cachename + ".csv"

    # read from cache if available
    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            msg = f"Location: {cachefile}" + "\n"
            df = pd.read_csv(cachefile, **params)
            logger.debug(f"Read cached {dataset}", extra={"transform": self.name, "data": msg})
            return df

    # read from S3
    dfs = []
    for path in paths:
        _df = self.read_s3_data(path, params)
        if _df is None:
            msg += f"Path error, skipping: {path}" + "\n"
            continue
        msg += f"Read from path: {path}" + "\n"
        dfs.append(_df)
    df = pd.concat(dfs)

    logger.debug(f"Read fresh {dataset}", extra={"transform": self.name})

    # Cache it for future use
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if (len(paths) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": paths,
                },
            ],
        }

    if not self.state.has_frame(spec['name']):
        self.update_frame(spec, f"Dataset: {dataset}", df, lineage)

    return df

get_handlers(spec)

Define various callbacks that take a dataframe, spec and compute.

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def get_handlers(self, spec):
    """
    Define various callbacks that take a dataframe, spec
    and compute.
    """
    return {}

load_sources(profilespec)

Load all the data sources

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def load_sources(self, profilespec):
    """
    Load all the data sources
    """
    data = {}

    for source in profilespec.get('sources', []):
        name = source.get('name', 'NOT_SPECIFIED')

        # check for all fields needed
        if any(p not in source for p in ['nature', 'name', 'filename', 'stage']):
            logger.error(f"Malformed source [{name}]",
                         extra=self.config.get_extra({
                             'transform': self.name,
                             'data': json.dumps(source, indent=4)
                         }))
            continue

        if source['nature'] == 'disk':
            filename = source['filename']
            filename = f"{self.args['root']}/{filename}"
            df = pd.read_csv(filename)
            if df is None:
                logger.error(f"Source not found [{name}]",
                             extra=self.config.get_extra({
                                 'transform': self.name,
                                 'data': json.dumps(source, indent=4)
                             }))
                continue

            data[name] = df

        else:
            continue

        self.update_state(name, df, f"Source: {name}")

    # we have loaded all available data sources
    return data

make_predictions(profilespec, data, classifiers, artifacts)

Generate predictions for the various eval datasets

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def make_predictions(self, profilespec, data, classifiers, artifacts):
    """
    Generate predictions for the various eval datasets
    """

    # to collect all the results
    results = {}

    # for each prediction spec in the profilespec
    for spec in profilespec.get('predict', []):
        # process it
        name = spec['name']
        if spec.get('enable', True) == False:
            logger.error(f"Spec [{name}] disabled, skipping",
                         extra=self.config.get_extra({
                             'transform': self.name
                         }))
            continue

        _dfs = []
        for source in spec.get('sources', []):
            _dfs.append(data[source])
        if len(_dfs)>0:
            eval_df = pd.concat(_dfs)
        else:
            logger.error(f"No sources to eval, skipping",
                         extra=self.config.get_extra({
                             'transform': self.name,
                             'data': json.dumps(spec, indent=4)
                         }))
            continue

        # get the target column name
        target = spec.get("target")
        ignore = spec.get("ignore", [])
        if target == None:
            logger.error(f"Target column not specified, skipping eval [{name}]",
                         extra=self.config.get_extra({
                             'transform': self.name,
                             'data': json.dumps(spec, indent=4)
                         }))
            continue

        # we now have eval_df
        # check if a prep data method is specified
        prep_data = profilespec.get("prep", {}).get("method")
        if prep_data != None:
            if hasattr(self, prep_data):
                handler = getattr(self, prep_data)
                prepped_eval_df, artifacts, msg = handler(eval_df, artifacts, 'predict')

            logger.debug(f"Prepped eval data [{name}]",
                         extra=self.config.get_extra({
                             'transform': self.name,
                             'data': msg
                         }))

        # check if all required columns are present
        missing_cols = set(artifacts['columns']).difference(set(prepped_eval_df.columns))
        for c in missing_cols:
            prepped_eval_df[c] = 0
        logger.debug(f"Added missing columns [{name}]",
                     extra=self.config.get_extra({
                         'transform': self.name,
                         'data': f"Missing cols: {missing_cols}"
                     }))


        # we now have the prepped eval df
        # run the specified classifier on it
        classifier_name = spec.get("model", "best")
        if classifier_name == "best":
            classifier_name = classifiers["best"]

        classifier = classifiers[classifier_name]['model']

        # create the data arrays
        X = prepped_eval_df[[c for c in prepped_eval_df.columns if c not in [target]+ignore]].to_numpy()

        # make the predictions
        r = classifier.predict(X)
        eval_df["__prediction"] = pd.Series(r)

        result = {
            "spec": spec,
            "n_datapoints": len(eval_df),
            "n_predictions": eval_df["__prediction"].value_counts().to_dict()
        }
        results[name] = result

        logger.debug(f"Predictions done [{name}]",
                     extra=self.config.get_extra({
                         'transform': self.name,
                         'data': note(eval_df, f"Predictions [{name}]")
                     }))

        # store results data csv
        self.store_result_data(profilespec, spec, result, eval_df)

    return results

prep_data(profilespec, data, artifacts)

Do any data prep needed. We may need to do data scaling, normalization, etc. here. Any artifacts of the prep that will be needed by the prediction stage must be returned in this function.

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def prep_data(self, profilespec, data, artifacts):
    """
    Do any data prep needed
    We may need to do data scaling, normalization, etc. here
    Any artifacts of the prep that will be needed by the
    prediction stage must be returned in this function
    """

    # setup the training data and artifacts
    train_data = None
    for source in profilespec['sources']:
        if source['stage'] == "train":
            train_data = source['name']
    if train_data == None:
        return data, artifacts

    # check if a prep data method is specified
    prep_data = profilespec.get("prep", {}).get("method")
    if prep_data == None:
        return data, artifacts

    # call the prep data method
    msg = ""
    if hasattr(self, prep_data):
        handler = getattr(self, prep_data)
        data[train_data], artifacts, msg = handler(data[train_data], artifacts, 'train')

    logger.debug(f"Prepped training data [{train_data}]",
                 extra=self.config.get_extra({
                     'transform': self.name,
                     'data': msg
                 }))

    return data, artifacts
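
A custom prep method is named in the spec under prep.method, must be defined on the subclass, and is called with the dataframe, the artifacts dict, and a stage flag ('train' here, 'predict' in make_predictions); it returns the prepped dataframe, the possibly updated artifacts, and a log message. A minimal sketch of such a method (one-hot encoding is an illustrative choice, and pandas is assumed to be imported as pd)::

    def prep_onehot(self, df, artifacts, stage):
        # one-hot encode categorical columns; anything the predict stage
        # needs to reproduce the preparation can be stashed in artifacts
        df = pd.get_dummies(df)
        msg = f"Prepped ({stage}): {df.shape[0]} rows, {df.shape[1]} columns" + "\n"
        return df, artifacts, msg

It would be referenced in the spec as "prep": {"method": "prep_onehot"}.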

process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug("Start execution",
                 extra=self.config.get_extra({
                     'transform': self.name
                 }))
    self.state = state

    # Get the profile spec
    is_valid, profile, msg = profilespec.get_profile(self, "policyapp.classifier")
    if is_valid:
        logger.debug(
            f"Loaded profilespec",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")

    specs = profile.get("specs", None)
    if specs is None:
        raise Exception("Could not find 'specs' in profile")

    # Now go through each spec and process it
    for spec in specs:

        ###
        # first, some checks on the spec
        do_process_spec = True
        name = spec.get('name', 'NO_SPEC_NAME')

        enabled = spec.get("active", True)
        if not enabled:
            logger.debug(
                f"Spec not enabled, skipping.",
                extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
            )
            do_process_spec = False
            continue

        for f in ["name", "sources", "train", "predict"]:
            if f not in spec:
                logger.exception(
                    f"Spec has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break
        if do_process_spec == False:
            continue

        ###
        # get all the data sources
        ##############
        # Re-write to use standard load_dataset(...) method
        # when using API-based spec and s3 data sources
        data = self.load_sources(spec)
        ##############
        if len(data) == 0:
            logger.exception("No datasources found, failing",
                         extra=self.config.get_extra({
                             'transform': self.name
                         }))
            raise Exception("No datasources")

        ###
        # model training stage
        classifiers, artifacts = self.train_models(spec, data)

        ###
        # make predictions for each evaluation dataset
        # and store results
        results = self.make_predictions(spec, data, classifiers, artifacts)

        # Store the metadata with results
        self.store_metadata(spec, results)

    # Done
    logger.debug("Complete execution",
                 extra=self.config.get_extra({
                     'transform': self.name
                 }))

    ###########################################
    # => Return
    ###########################################
    return state

store_metadata(spec, results)

Store all the metadata for the full run

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def store_metadata(self, spec, results):
    """
    Store all the metadata for the full run
    """
    metadata = self.get_default_metadata(self.state)
    metadata['spec'] = spec
    metadata['results'] = results

    store = spec.get("store", ["disk"])

    if "s3" in store:
        # store in s3
        appname     = spec.get('app',self.name)
        name        = spec['name']
        namespace   = spec.get('namespace', 'default')
        run_date    = self.args['run_date']
        s3          = self.args['s3']
        epoch       = self.epoch

        # where are we storing it?
        targetdir = os.path.join(self.args['s3root'], f"{appname}/{namespace}/{name}/{run_date}/{epoch}")
        metadatafile = os.path.join(targetdir, f"metadata.json")

        # write to s3
        with s3.open(metadatafile, 'w') as fd:
            json.dump(metadata, fd, indent=4, cls=SafeEncoder)
    if "db" in store:
        # store in db
        self.db_store_metadata(spec, predictspec, result, df)
    if "disk" in store:
        # store in disk
        name = spec['name']
        outfile = os.path.join(self.args['output'], f"{name}/metadata.json")
        with open(outfile, 'w') as fd:
            fd.write(json.dumps(metadata,indent=4))
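
The store list in the spec controls where this metadata lands; 'disk' is the default. For example (app and namespace values are hypothetical), a spec carrying::

    spec = {
        "name": "churn-classifier",
        "app": "policyapp",             # hypothetical app name
        "namespace": "acme",            # hypothetical namespace
        "store": ["s3", "disk"],
        # ... sources / train / predict as above ...
    }

would write metadata.json both to <s3root>/policyapp/acme/churn-classifier/<run_date>/<epoch>/ on S3 and to <output>/churn-classifier/ on disk.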

train_models(profilespec, data)

Model training

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def train_models(self, profilespec, data):
    """
    Model training
    """
    msg = ""

    # prep the training data
    # and generate any artifacts needed later
    artifacts = {}
    data, artifacts = self.prep_data(profilespec, data, artifacts)
    # we need a list of all columns which will be used in training
    for source in profilespec['sources']:
        if source['stage'] == "train":
            train_data = source['name']
            artifacts['columns'] = list(data[train_data].columns)

    # required params
    trainspec = profilespec.get("train")
    metric  = trainspec.get("metric", "auc") #what is the metric against which to compare models
    folds   = trainspec.get("folds", 1)      #how many folds for cross validation

    classifiers = {}

    # for each model to train
    models = trainspec.get("models", [])
    for model in models:
        if model.get("enable", True) == False:
            continue

        name    = model.get("name", f"{hashlib.md5(json.dumps(model).encode('utf-8')).hexdigest()}")
        model['name'] = name
        dataset = model.get("source")
        target  = model.get("target")
        ignore  = model.get("ignore", [])

        if dataset == None or dataset not in data:
            logger.error(f"Dataset not known, skipping training [{name}]",
                         extra=self.config.get_extra({
                             'transform': self.name,
                             'data': json.dumps(model, indent=4)
                         }))
            continue
        if target == None:
            logger.error(f"Target column not specified, skipping training [{name}]",
                         extra=self.config.get_extra({
                             'transform': self.name,
                             'data': json.dumps(model, indent=4)
                         }))
            continue

        msg += f"Model: {name}" + "\n"
        msg += f"Dataset: {dataset}" + "\n"
        msg += f"Target column: {target}" + "\n"
        msg += f"Ignore columns: {ignore}" + "\n"

        df = data[dataset]

        # create the data arrays
        X = df[[c for c in df.columns if c not in [target]+ignore]].to_numpy()
        y = df[target].to_numpy()

        msg += f"Size (X): {X.size}" + "\n"
        msg += f"Size (y): {y.size}" + "\n"

        # figure out the minority class
        # in case we need to resample
        class_distribution = pd.Series(y).value_counts(normalize=True)
        pos_label = class_distribution.idxmin()
        msg += f"Positive label: {pos_label}" + "\n"

        # construct the classifier pipeline object
        classifier_pipeline = self.get_classifier_pipeline(model)

        # set up the n-fold cross validation
        cv = StratifiedKFold(n_splits=folds)

        # do model training
        classifiers[name], l_msg = self.do_training(profilespec, model, X, y, classifier_pipeline, cv, metric)
        msg += l_msg

    # decide on what the best classifier is based on the metric
    classifiers['best'] = self.decide_best_classifier(classifiers)

    msg += f"Classifiers: {json.dumps(classifiers, indent=4, cls=SafeEncoder)}" + "\n"
    msg += f"Artifacts: {json.dumps(artifacts, indent=4, cls=SafeEncoder)}" + "\n"

    logger.debug(f"Completed training",
                 extra=self.config.get_extra({
                     'transform': self.name,
                     'data': msg
                 }))

    return classifiers, artifacts

DataObserverBase(*args, **kwargs)

Bases: Compute

Monitor an input data source given a spec

Features of transform baseclass include:

* Flexible configuration
* Highlevel specification of observability:
    * specified data source
    * custom defined testing conditions for observability
    * custom defined output of observability results
    * notification of observability results on success/failure

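A spec for this transform must carry a name and a config with source, checks, and store blocks (see process below); the source block is what get_dataset_s3 reads. A minimal sketch, with the checks and store entries shown only as hypothetical placeholders since they are interpreted by the subclass::

    spec = {
        "name": "orders-observability",                   # hypothetical
        "enable": True,
        "config": {
            "source": {
                "type": "direct",                         # 'registry' uses get_dataset(), 'direct' reads the file
                "dataset": "orders-daily",
                "filename": "s3://bucket/orders/daily.csv",   # hypothetical path
                "params": {},                             # passed through to the reader
            },
            "checks": [],                                 # interpreted by process_spec in the subclass
            "store": {},                                  # interpreted by store_result in the subclass
        },
    }
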
Source code in enrichsdk/contrib/lib/transforms/observability/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "DataObserverBase"
    self.description = "Monitor an input data source given a spec"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }

get_dataset_s3(spec)

Use the dataset object to read the dataset

Source code in enrichsdk/contrib/lib/transforms/observability/__init__.py
def get_dataset_s3(self, spec):
    """
    Use the dataset object to read the dataset
    """
    run_date    = self.args['run_date']
    name        = spec["name"]
    config      = spec['config']
    source      = config['source']

    for f in ["dataset", "filename"]:
        if f not in source:
            msg = f"{f} param needed in config source" + "\n"
            logger.exception(
                f"Dataset: {name} -- skipping", extra={"transform": self.name, "data": msg}
            )
            return None

    dataset_type    = source['type']
    dataset         = source['dataset']
    pieces          = dataset.split('-')
    dataset_main    = "-".join(pieces[:-1])
    dataset_subset  = pieces[-1]
    filename        = source["filename"]
    params          = source.get("params", {})

    cache = self.args.get("cache", False)
    cachename = f"{dataset}-{run_date}"
    cachefile = f"cache/{self.name}-anonymizer-cache-" + cachename + ".csv"

    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            msg = f"Location: {cachefile}" + "\n"
            df = pd.read_csv(cachefile, **params)
            msg += note(df, f"Cached {dataset}") + "\n"
            logger.debug(f"Read cached {name}", extra={"transform": self.name, "data": msg})
            return df

    if dataset_type == "registry":
        if not hasattr(self, "get_dataset"):
            raise Exception(
                "get_dataset_s3 expects get_dataset method"
            )
        datasetobj = self.get_dataset(dataset_main) # this method should be defined in the derived class

        if hasattr(self, 'update_doodle'):
            self.update_doodle(datasetobj, source['filename'])

        df, metadata = datasetobj.read_data(
            run_date,
            run_date,
            filename=filename,
            readfunc=self.read_s3_data,
            params=params,
        )
    elif dataset_type == "direct":
        df = self.read_s3_data(filename, params)
        metadata = { "files": [filename] }
    else:
        logger.exception(
            f"Unknown source param: {dataset_type}, skipping", extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
        )
        return None

    msg = note(df, f"Fresh {dataset}") + "\n"
    logger.debug(f"Read fresh {name}", extra={"transform": self.name, "data": msg})

    # Cache it for future use...
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if ("files" in metadata) and (len(metadata["files"]) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": [metadata["files"][-1]],
                },
            ],
        }

    if not self.state.has_frame(spec['name']):
        self.update_frame(spec, f"Dataset: {dataset}", df, lineage)

    return df

get_handlers(spec)

Define various callbacks that take a dataframe, spec and compute.

Source code in enrichsdk/contrib/lib/transforms/observability/__init__.py
def get_handlers(self, spec):
    """
    Define various callbacks that take a dataframe, spec
    and compute.
    """
    return {}

process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/observability/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    # Will be used in other places..
    self.state = state

    # Get the profile spec
    is_valid, profile, msg = profilespec.get_profile(self, "observability")
    if is_valid:
        logger.debug(
            f"Loaded profilespec",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")

    specs = profile.get("specs", None)
    if specs is None:
        raise Exception("Could not find 'specs' in profile")

    # get the dataset lookup table
    customer_datasets = profilespec.construct_dataset_list(self, specs)

    # Now go through each spec and process it
    for spec in specs:

        ## first, some checks on the spec
        do_process_spec = True
        name = spec.get('name', 'NO_SPEC_NAME')

        enabled = spec.get("enable", True)
        if not enabled:
            logger.debug(
                f"Spec not enabled, skipping.",
                extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
            )
            do_process_spec = False
            continue

        for f in ["name", "config"]:
            if f not in spec:
                logger.exception(
                    f"Spec has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break
        if do_process_spec == False:
            continue

        config = spec['config']

        for f in ["source", "checks", "store"]:
            if f not in config:
                logger.exception(
                    f"Spec config has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break
        if do_process_spec == False:
            continue


        ## we can now proceed with processing the spec
        # first, load the source data
        data = self.load_dataset(spec, customer_datasets)

        # then, process it
        results = self.process_spec(spec, data)
        if results is None:
            continue

        ## notify the observability result
        results = self.notify_result(spec, results, data)

        ## store the observability result and notification status
        self.store_result(spec, results, data)

        # update frame for pipeline
        description = spec.get("desc", f"{name} observability results")
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": [spec.get("filename", "__NEW__")],
                },
            ],
        }
        self.update_frame(
            spec,
            description,
            results,
            lineage,
        )

    # Done
    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state

DataSanitizerBase(*args, **kwargs)

Bases: Compute

Sanitize data based on rules.

Features of transform baseclass include:

* Flexible configuration
* Highlevel specification of transformations:
    * specified data source
    * custom defined rules

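The process method below requires each spec to have a name, a source, and a transformations list; the source is resolved by load_dataset/get_dataset_s3, and the transformation rules are interpreted by process_spec in the subclass. A minimal sketch with hypothetical values::

    spec = {
        "name": "orders-cleanup",              # hypothetical
        "active": True,
        "source": {
            "dataset": "orders-daily",         # resolved by load_dataset / get_dataset_s3
            "params": {},                      # passed through to the reader
        },
        "transformations": [],                 # sanitization rules, interpreted by process_spec in the subclass
    }
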
Source code in enrichsdk/contrib/lib/transforms/data_sanitizer/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "DataSanitizerBase"
    self.description = "Sanitize data based on rules"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }

get_dataset_s3(spec, source, paths, start_date, end_date)

Gets all files from paths and puts them together into a single dataframe. If self.args['cache']==True, then this consolidated dataframe is cached / read from cache as applicable.

Source code in enrichsdk/contrib/lib/transforms/data_sanitizer/__init__.py
def get_dataset_s3(self, spec, source, paths, start_date, end_date):
    '''
    Gets all files from paths and puts them together
    into a single dataframe. If self.args['cache']==True,
    then this consolidated dataframe is cached / read from cache
    as applicable.
    '''
    msg = ""

    run_date    = self.args['run_date']
    dataset     = source['dataset']
    params      = source.get('params', {})

    cache = self.args.get("cache", False)
    cachename = f"{dataset}-{start_date}-to-{end_date}"
    cachefile = f"cache/{self.name}-rawdata-cache-" + cachename + ".csv"

    # read from cache if available
    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            msg = f"Location: {cachefile}" + "\n"
            df = pd.read_csv(cachefile, **params)
            logger.debug(f"Read cached {dataset}", extra={"transform": self.name, "data": msg})
            return df

    # read from S3
    dfs = []
    for path in paths:
        _df = self.read_s3_data(path, params)
        if _df is None:
            msg += f"Path error, skipping: {path}" + "\n"
            continue
        msg += f"Read from path: {path}" + "\n"
        dfs.append(_df)
    df = pd.concat(dfs)

    logger.debug(f"Read {dataset}", extra={"transform": self.name})

    # Cache it for future use
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if (len(paths) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": paths,
                },
            ],
        }

    if not self.state.has_frame(spec['name']):
        self.update_frame(spec['name'] + "-raw", f"Raw Dataset: {spec['name']}", df, lineage)

    return df    

get_handlers(spec)

Define various callbacks that take a dataframe, spec and compute.

Source code in enrichsdk/contrib/lib/transforms/data_sanitizer/__init__.py
def get_handlers(self, spec):
    """
    Define various callbacks that take a dataframe, spec
    and compute.
    """
    return {}

process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/data_sanitizer/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Start execution", extra={"transform": self.name}
    )

    # Will be used in other places..
    self.state = state

    # Get the profile spec
    is_valid, profile, msg = profilespec.get_profile(self, "policyapp.dataqualityv2")
    if is_valid:
        name = profile.get('name', 'unknown')
        logger.debug(
            f"Loaded profilespec: {name}",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")

    specs = profile.get("specs", None)
    if specs is None:
        raise Exception("Could not find 'specs' in profile")

    # Now go through each spec and process it
    for spec in specs:

        ## first, some checks on the spec
        do_process_spec = True
        name = spec.get('name', 'NO_SPEC_NAME')

        enabled = spec.get("active", True)
        if not enabled:
            logger.debug(
                f"Spec not enabled, skipping.",
                extra={"transform": self.name, "data": json.dumps(spec, indent=4, cls=SafeEncoder)}
            )
            do_process_spec = False
            continue

        for f in ["name", "source", "transformations"]:
            if f not in spec:
                logger.exception(
                    f"Spec has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4, cls=SafeEncoder)}
                )
                do_process_spec = False
                break
        if do_process_spec == False:
            continue

        ## we can now proceed with processing the spec            
        source = spec['source']

        # get time_window for indicator and observation data set
        datewindow = self.get_datewindow(source, spec)
        if datewindow is None :
            do_process_spec = False
            continue

        ## we can now proceed with processing the spec
        # first, load the source data
        data = self.load_dataset(spec, source, datewindow)

        # then, process it
        result, msg = self.process_spec(spec, data)
        if result is None:
            continue

        ## store the expectation validation result
        self.store_result(spec, result, {'notes': msg})


    # Done
    logger.debug(
        "Complete execution", extra={"transform": self.name}
    )

    ###########################################
    # => Return
    ###########################################
    return state

FeatureComputeBase(*args, **kwargs)

Bases: Compute

A built-in transform baseclass to handle standard feature computation and reduce the duplication of code.

This should be used in conjunction with a FeaturesetExtractor & FeatureExtractor

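Using this base class means subclassing it and supplying at least the featureset extractors and the store step; get_objects and read_object below provide file-based defaults that read JSON objects from args['root']. A minimal sketch, assuming FeatureComputeBase is importable and PatientFeatures is a hypothetical featureset extractor::

    class PatientFeatureCompute(FeatureComputeBase):
        @classmethod
        def instantiable(cls):
            # allow the pipeline to instantiate this subclass
            return True

        def get_featureset_extractors(self):
            # one entry per featureset to compute
            return [{"name": "patient", "extractor": PatientFeatures()}]

        def store(self, data):
            # data maps featureset name -> computed dataframe
            for name, df in data.items():
                df.to_csv(f"{self.args['root']}/{name}-features.csv", index=False)
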
Source code in enrichsdk/contrib/lib/transforms/feature_compute/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "FeatureComputeBase"
    self._environ = os.environ.copy()

get_featureset_extractors()

Get all the featureset extractors (not feature extractors)

Returns:

list: A list of extractors as a name, extractor combination

For example::

return [{
     "name": "patient",
     "extractor": <featureset extractor instance>
}]
Source code in enrichsdk/contrib/lib/transforms/feature_compute/__init__.py
def get_featureset_extractors(self):
    """
    Get all the *featureset* extractors (not feature extractors)

    Returns:
         list: A list of extractors as a name, extractor combination

    For example::

        return [{
             "name": "patient",
             "extractor": <featureset extractor instance>
        }]

    """
    raise Exception("Implement in subclass")

get_objects()

Get a list of objects (typically names) to process. Could be dictionaries, lists etc. The list is not interpreted by the base class. Could be a list of identifiers.

Returns:

list: A list of objects (could be ids/paths/dicts etc.)

Source code in enrichsdk/contrib/lib/transforms/feature_compute/__init__.py
def get_objects(self):
    """
    Get a list of objects (typically names) to process. Could be dictionaries,
    lists etc. The list is not interpreted by the base class. Could be a list of
    identifiers.

    Returns:
       list: A list of objects (could be ids/paths/dicts etc.)

    """
    if "root" not in args:
        raise Exception("Base class implementation required 'root'")

    root = self.args["root"]
    files = os.listdir(root)
    return files

instantiable() classmethod

Return true if class can be instantiated. Override in subclass

Source code in enrichsdk/contrib/lib/transforms/feature_compute/__init__.py
@classmethod
def instantiable(cls):
    """
    Return true if class can be instantiated. Override in subclass
    """
    return False

process(state)

Core loop

Rough logic::

get featureset extractors
get objects
for each object:
     for each featureset extractor X
         process one object with X
         collect one featureset 'row' for X

for each featureset extractor X
Source code in enrichsdk/contrib/lib/transforms/feature_compute/__init__.py
def process(self, state):
    """
    Core loop

    Rough logic::

        get featureset extractors
        get objects
        for each object:
             for each featureset extractor X
                 process one object with X
                 collect one featureset 'row' for X

        for each featureset extractor X

    """

    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    self.state = state

    # What extractors to run on the data..
    featureset_extractors = self.get_featureset_extractors()
    featureset_extractors = [
        f for f in featureset_extractors if f.get("enable", True)
    ]

    # Go through all the available objects
    objects = self.get_objects()
    logger.debug(f"Received {len(objects)} objects", extra={"transform": self.name})

    # Compute the features..
    final = compute_features(objects, featureset_extractors, self.read_object)

    # Update the frame
    for name, df in final.items():
        if isinstance(df, pd.DataFrame):
            self.update_frame(
                name + "_features",
                "Features computed over the available data",
                df,
                objects[0],
            )

    # Store the result...
    files = self.store(final)

    registry = self.get_registry()
    dataset = registry.find(list(final.keys()))
    metadata = {
        'files': files
    }
    registry.access(dataset, metadata, 'write')

    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state

read_object(obj)

Read one object returned by get_objects

Args:

obj (object): One item in the list of objects

Returns:

object: An object like dict or list of dicts

Source code in enrichsdk/contrib/lib/transforms/feature_compute/__init__.py
def read_object(self, obj):
    """
    Read one object returned by get_objects

    Args:
        obj (object): One item in the list of objects

    Returns:
         object: An object like dict or list of dicts

    """

    if "root" not in args:
        raise Exception("Base class implementation required 'root'")

    root = self.args["root"]
    filename = os.path.join(root, obj)
    data = json.load(open(filename))
    return data

store(data)

Store the final result

Args:

data (dict): name of featureset -> data associated with it

Source code in enrichsdk/contrib/lib/transforms/feature_compute/__init__.py
def store(self, data):
    """
    Store the final result

    Args:
        data (dict): name of featureset -> data associated with it

    """
    raise Exception("Implement in subclass")

validate_results(what, state)

Check to make sure that the execution completed correctly

Source code in enrichsdk/contrib/lib/transforms/feature_compute/__init__.py
def validate_results(self, what, state):
    """
    Check to make sure that the execution completed correctly
    """
    pass

FileBasedQueryExecutorBase(*args, **kwargs)

Bases: Compute

Base class for a File-based QueryExecutor transform. This is useful to run queries against backends such as MySQL.

Features of transform baseclass include:

* Support query engines (MySQL, Hive, Presto)
* Support templatized execution
* Support arbitrary number of queries
* Supports a generator function to generate per-interval queries

Configuration looks like::

...
"args": {
    "cleanup": False,
    "force": True,
    "names": "all",
    "start": "2020-08-01",
    "end": "2020-08-03",
}
Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "FileBasedQueryExecutorBase"
    self.description = "Execute queries against backends and store in files"
    self.supported_extra_args = [
        {
            "name": "names",
            "description": "names of the queries to execute",
            "default": "all",
            "required": False,
        },
        {
            "name": "force",
            "description": "Force execution",
            "default": "False",
            "required": False,
        },
        {
            "name": "start",
            "description": "Start of the time window",
            "default": "",
            "required": True,
        },
        {
            "name": "end",
            "description": "End of the time window",
            "default": "",
            "required": True,
        },
    ]

    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }

generator_daily(spec, specitem, query)

Built-in function to generate a list of dates (one for each day) between two dates.

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def generator_daily(self, spec, specitem, query):
    """
    Built-in function to generate a list of dates (one for each day)
    between two dates.

    """

    start = self.args["start"]
    end = self.args["end"]
    if start > end:
        start, end = end, start

    if isinstance(start, datetime):
        start = start.date()
    if isinstance(end, datetime):
        end = end.date()

    # Pass any extra parameters
    extra = query.get("params", {})
    paramlist = []
    dt = start
    while dt < end:
        params = {"dt": dt.isoformat()}
        params.update(extra)

        dt += relativedelta.relativedelta(days=1)
        paramlist.append(params)

    return paramlist
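
For a window of 2020-08-01 to 2020-08-03 this yields one parameter set per day, with the end date excluded. A standalone sketch of the same logic (the extra 'alpha' parameter is illustrative)::

from datetime import date
from dateutil import relativedelta

def daily_params(start, end, extra=None):
    # Mirrors generator_daily: one {"dt": ...} dict per day in [start, end)
    extra = extra or {}
    if start > end:
        start, end = end, start
    paramlist, dt = [], start
    while dt < end:
        params = {"dt": dt.isoformat()}
        params.update(extra)
        paramlist.append(params)
        dt += relativedelta.relativedelta(days=1)
    return paramlist

print(daily_params(date(2020, 8, 1), date(2020, 8, 3), {"alpha": 22}))
# [{'dt': '2020-08-01', 'alpha': 22}, {'dt': '2020-08-02', 'alpha': 22}]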

get_executor(specitem, query, credentials)

Get executor for a specitem and credentials. This executor runs the query.

The executor can be specified within the query or the spec item, or it can default to a built-in one based on the credentials and the dbtype within.

Args:

spec (dict): Specification of the query
query (dict): Particular query to execute
credentials (dict): Credentials for the backend

Returns:

a callable executor
Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def get_executor(self, specitem, query, credentials):
    """
    Get executor for a specitem and credentials. This executor
    runs the query.

    The executor could be specified within the query,
    spec, or could default to built-in one based on the
    credentials and dbtype within.

    Args:

        spec (dict): Specification of the query
        query (dict): Particular query to execute
        credentials (dict): Credentials for the backend

    Returns:

        a callable executor
    """

    default_executor = None
    if credentials["dbtype"] == "mysql":
        default_executor = self.mysql_executor
    elif credentials["dbtype"] == "hive":
        default_executor = self.hive_executor

    # Executor can be per query or for the entire set
    executor = query.get("executor", specitem.get("executor", default_executor))

    if (executor is not None) and callable(executor):
        return executor

    if (executor is not None) and hasattr(self, executor):
        return getattr(self, executor)

    raise Exception("Cant find executor: {}".format(executor))

get_generator(specitem, query)

Return the parameter generator. This is useful when a templatized query has to be run against the backend over many days. The output of the generator function is a list of dictionaries, each of which is a key-value set for one time window (say, a day).

Args:

spec (dict): Specification of the query
query (dict): Particular query to execute

Returns:

a callable generator function
Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def get_generator(self, specitem, query):
    """
    Parameters generator. This is useful when a templatized
    query has to be run against the backend over many
    days. The output of the generator function is a list of
    dictionaries each of which is a key-value set for one
    time window (say a day)

    Args:

        spec (dict): Specification of the query
        query (dict): Particular query to execute

    Returns:

        a callable generator function
    """
    generator = query.get("generator", specitem.get("generator", "generator_daily"))
    if (generator is not None) and callable(generator):
        return generator

    if (generator is not None) and hasattr(self, generator):
        return getattr(self, generator)

    raise Exception("Could not find generator: {}".format(generator))

get_output_handler(query, params)

Find a handler for the output of the query. Override this function to compute the handler dynamically.

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def get_output_handler(self, query, params):
    """
    Find a handler for the output of the query. This function
    should be over-ridden to compute the handler dynamically.

    """
    if "output" not in query:
        raise Exception("Unable to determine output handler. No 'output' in query")

    if isinstance(query["output"], str):
        return FileOutputHandler(self, query["output"])

    raise Exception("Unable to determine output handler for 'output' in query")

get_spec()

Get query execution specification. Override this

Returns:

specs (list): A list of dictionaries. Each dictionary specifies name, credentials, queries to run

Example::

return [ { "name": "roomdb", "cred": "roomdb", "queries": [ { "name": "select_star", "output": "%(data_root)s/shared/db/select_star/%(dt)s.tsv", "sql": "%(transform_root)s/SQL/select_star.sql", "params": { "alpha": 22 } } ] }, { "enable": False, "name": "hive", "cred": "hiveserver", "queries": [ { "name": "employees", "output": "%(data_root)s/shared/db/employee/%(dt)s.tsv", "sql": "%(transform_root)s/SQL/employees.hql", } ] } ]

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def get_spec(self):
    """
    Get query execution specification. Override this

    Returns:

       specs (list): A list of dictionaries. Each dictionary
                     specifies name, credentials, queries to run

    Example::

       return [
           {
               "name": "roomdb",
               "cred": "roomdb",
               "queries": [
                   {
                       "name": "select_star",
                       "output": "%(data_root)s/shared/db/select_star/%(dt)s.tsv",
                       "sql": "%(transform_root)s/SQL/select_star.sql",
                       "params": {
                        "alpha": 22
                       }
                   }
               ]
           },
           {
               "enable": False,
               "name": "hive",
               "cred": "hiveserver",
               "queries": [
                   {
                       "name": "employees",
                       "output": "%(data_root)s/shared/db/employee/%(dt)s.tsv",
                       "sql": "%(transform_root)s/SQL/employees.hql",
                   }
               ]
           }
       ]

    """
    return []

hive_executor(specitem, credentials, query, params)

Built-in executor for queries against a Hive backend. The output is dumped to a temporary file and then an output handler is called for post-processing.

Args:

spec (dict): Specification of the query
query (dict): Particular query to execute
credentials (dict): Credentials for the backend
Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def hive_executor(self, specitem, credentials, query, params):
    """
    Built in executor for queries against a hive backend. The output
    is dumped to a temporary file and then an output handler is called
    for post-processing.

    Args:

        spec (dict): Specification of the query
        query (dict): Particular query to execute
        credentials (dict): Credentials for the backend

    """
    try:

        targetdir = None

        # Should this be forced?
        force = self.args.get("force", False)
        cleanup = self.args.get("cleanup", True)

        # Get the output filename (s3, hdfspath etc.)
        handler = self.get_output_handler(query, params)

        if (not force) and handler.exists(params):
            logger.debug(
                "[Skipping:Exists] {} {}".format(query["name"], params["dt"]),
                extra={"transform": self.name},
            )
            return

        logger.debug(
            "[Computing] {} {}".format(query["name"], params["dt"]),
            extra={"transform": self.name},
        )

        # Create a temp directory
        targetdir = tempfile.mkdtemp(prefix="query_executor_")

        # Process the credentials
        config = get_mysql_config(credentials)

        # Instantiate the sql
        sqlfile = self.get_file(query["sql"])
        if not os.path.exists(sqlfile):
            raise Exception("Invalid sql file: {}".format(sqlfile))
        sql = open(sqlfile).read()

        # Resolve the sql content
        sql = sql.format(**params)

        sqlname = os.path.join(targetdir, "run.sql")
        with open(sqlname, "w") as fd:
            fd.write(sql)

        # => Now write the script
        tmpname = os.path.join(targetdir, "output.tsv")

        cmd = (
            "beeline -u jdbc:hive2://%(host)s:%(port)s --silent=true --verbose=False --outputformat=tsv"
            % config
        )

        if "user" in config:
            cmd += " -n '{}'".format(user)

        if "password" in config:
            cmd += " -p '{}'".format(password)

        # Generate the script to run
        script = """#!/bin/bash\n\n"""
        script += "date\n"
        script += "{} -f {} > {}\n".format(cmd, sqlname, tmpname)
        script += "date\n"
        script += "[ -s {0} ] && sed -i 's/\\r//g' {0}\n".format(tmpname)
        scriptname = os.path.join(targetdir, "run.sh")
        with open(scriptname, "w") as fd:
            fd.write(script)

        try:
            process = subprocess.Popen(
                ["/bin/bash", scriptname],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            out, err = process.communicate()
            logger.debug(
                "Executed the script",
                extra={
                    "transform": self.name,
                    "data": "Output:\n----\n"
                    + out.decode("utf-8")
                    + "\n\nError:\n-----\n"
                    + err.decode("utf-8"),
                },
            )
        except:
            logger.exception(
                "Error while executing the script",
                extra={
                    "transform": self.name,
                },
            )

        # => Now post-process it..
        handler.process(tmpname, params)

    except:
        logger.exception("Failed to execute", extra={"transform": self.name})

    try:
        if cleanup and (targetdir is not None) and os.path.exists(targetdir):
            shutil.rmtree(targetdir)
        else:
            logger.warning(
                "Targetdir not removed",
                extra=self.config.get_extra(
                    {
                        "transform": self.name,
                        "data": "Targetdir: {}".format(targetdir),
                    }
                ),
            )
    except:
        logger.exception(
            "Cleanup failed",
            extra=self.config.get_extra(
                {"transform": self.name, "data": "Targetdir: {}".format(targetdir)}
            ),
        )

mysql_executor(specitem, credentials, query, params)

Built-in executor for queries against a MySQL backend. The output is dumped to a temporary file and then an output handler is called for post-processing.

Args:

spec (dict): Specification of the query
query (dict): Particular query to execute
credentials (dict): Credentials for the backend
Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def mysql_executor(self, specitem, credentials, query, params):
    """
    Built in executor for queries against a mysql backend. The output
    is dumped to a temporary file and then an output handler is called
    for post-processing.

    Args:

        spec (dict): Specification of the query
        query (dict): Particular query to execute
        credentials (dict): Credentials for the backend

    """
    try:

        targetdir = None

        # Should this be forced?
        force = self.args.get("force", False)
        cleanup = self.args.get("cleanup", True)

        # Get the output filename (s3, hdfspath etc.)
        handler = self.get_output_handler(query, params)

        if (not force) and handler.exists(params):
            logger.debug(
                "[Skipping:Exists] {} {}".format(query["name"], params["dt"]),
                extra={"transform": self.name},
            )
            return

        logger.debug(
            "[Computing] {} {}".format(query["name"], params["dt"]),
            extra={"transform": self.name},
        )

        # Create a temp directory
        targetdir = tempfile.mkdtemp(prefix="query_executor_")

        # Process the credentials
        config = get_mysql_config(credentials)

        # Create the environment file
        cnfname = os.path.join(targetdir, "env.sh")
        with open(cnfname, "w") as fd:
            fd.write("[client]\n")
            for var in ["host", "user", "password"]:
                fd.write("{}={}\n".format(var, config[var]))

        # Instantiate the sql
        sqlfile = self.get_file(query["sql"])
        if not os.path.exists(sqlfile):
            raise Exception("Invalid sql file: {}".format(sqlfile))
        sql = open(sqlfile).read()

        # Resolve the sql content
        sql = sql.format(**params)

        sqlname = os.path.join(targetdir, "run.sql")
        with open(sqlname, "w") as fd:
            fd.write(sql)

        # => Now write the script
        tmpname = os.path.join(targetdir, "output.tsv")

        cmd = "mysql --defaults-extra-file={}".format(cnfname)

        # Generate the script to run
        script = """#!/bin/bash\n\n"""
        script += "date\n"
        script += "{} -B < {} > {}\n".format(cmd, sqlname, tmpname)
        script += "date\n"
        script += "[ -s {0} ] && sed -i 's/\\r//g' {0}\n".format(tmpname)
        scriptname = os.path.join(targetdir, "run.sh")
        with open(scriptname, "w") as fd:
            fd.write(script)

        try:
            process = subprocess.Popen(
                ["/bin/bash", scriptname],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            out, err = process.communicate()
            logger.debug(
                "Executed the script",
                extra={
                    "transform": self.name,
                    "data": "Output:\n----\n"
                    + out.decode("utf-8")
                    + "\n\nError:\n-----\n"
                    + err.decode("utf-8"),
                },
            )
        except:
            logger.exception(
                "Error while executing the script",
                extra={
                    "transform": self.name,
                },
            )

        # => Now post-process it..
        handler.process(tmpname, params)

    except:
        logger.exception("Failed to execute", extra={"transform": self.name})

    try:
        if cleanup and (targetdir is not None) and os.path.exists(targetdir):
            shutil.rmtree(targetdir)
        else:
            logger.warning(
                "Targetdir not removed",
                extra=self.config.get_extra(
                    {
                        "transform": self.name,
                        "data": "Targetdir: {}".format(targetdir),
                    }
                ),
            )
    except:
        logger.exception(
            "Cleanup failed",
            extra=self.config.get_extra(
                {"transform": self.name, "data": "Targetdir: {}".format(targetdir)}
            ),
        )

preload_clean_args(args)

Check validity of the args

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def preload_clean_args(self, args):
    """
    Check validity of the args
    """
    args = super().preload_clean_args(args)

    if ("start" not in args) or ("end" not in args):
        raise Exception("Start or end of timeframe missing")

    try:
        start = dateparser.parse(args["start"])
        args["start"] = start
        end = dateparser.parse(args["end"])
        args["end"] = end
    except:
        logger.exception("Invalid start or end", extra={"transform": self.name})
        raise Exception("Invalid start/end datetime specified")

    if (
        ("names" not in args)
        or (not isinstance(args["names"], str))
        or (len(args["names"]) == 0)
    ):
        raise Exception("Invalid list of query names specified")

    # Include force
    force = str(args["force"]).lower().strip()
    force = force == "true"
    args["force"] = force

    # Clean the list of names...
    names = args["names"].split(",")
    names = [n.strip() for n in names if len(n.strip()) > 0]
    args["names"] = [n for n in names if len(n) > 0]

    return args
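
As a rough illustration of the cleaning behaviour (a sketch, not output captured from the library), raw pipeline args would be normalized approximately as follows::

raw = {
    "names": "select_star, employees",
    "force": "True",
    "start": "2020-08-01",
    "end": "2020-08-03",
}
# After preload_clean_args (approximately):
#   names -> ["select_star", "employees"]
#   force -> True
#   start -> datetime(2020, 8, 1, 0, 0)
#   end   -> datetime(2020, 8, 3, 0, 0)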

process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Starting Query Execution",
        extra=self.config.get_extra({"transform": self.name}),
    )

    start = self.args["start"]
    end = self.args["end"]

    # Get and validate spec
    spec = self.get_spec()
    self.validate_spec(spec)
    try:
        self.process_spec(spec)
    except:
        logger.exception(
            "Failed while processing spec", extra={"transform": self.name}
        )
        raise

    logger.debug(
        "Completed Query Execution",
        extra=self.config.get_extra({"transform": self.name}),
    )
    ###########################################
    # => Return
    ###########################################
    return state

process_spec(spec)

Process query specification

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def process_spec(self, spec):
    """
    Process query specification
    """

    names = self.args["names"]
    for specitem in spec:

        try:

            itemname = specitem["name"]

            enable = specitem.get("enable", True)
            if not enable:
                logger.error(
                    "Skipping: {}. Not enabled.".format(itemname),
                    extra={"transform": self.name},
                )
                continue

            # What should we be executing to begin with..?
            toexecute = []
            for name in names:
                if name in specitem.get("definitions", {}):
                    toexecute.extend(specitem["definitions"][name])

                for q in specitem["queries"]:
                    if (name == "all") or (q["name"] == name):
                        toexecute.append(q["name"])

            # Cleanup
            toexecute = list(set(toexecute))

            if len(toexecute) == 0:
                logger.error(
                    "No parameter list generated: {}".format(itemname),
                    extra={"transform": self.name},
                )
                continue

            # Now process the list of queries. Params will be
            # generated per specitem.
            self.process_specitem(spec, specitem, toexecute)

            logger.debug(
                "Completed execution: {}".format(itemname),
                extra={"transform": self.name},
            )
        except:
            logger.exception(
                "Unable to execute: {}".format(itemname),
                extra={"transform": self.name},
            )

validate_results(what, state)

Check to make sure that the execution completed correctly

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def validate_results(self, what, state):
    """
    Check to make sure that the execution completed correctly
    """
    pass

validate_spec(spec)

Check whether specification is valid

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def validate_spec(self, spec):
    """
    Check whether specification is valid
    """

    if not isinstance(spec, list):
        raise Exception("Query specification should be a list")

    for specitem in spec:

        if not isinstance(specitem, dict):
            logger.error(
                "Query specification items should be dicts",
                extra={
                    "transform": self.name,
                    "data": "Found {}\n{}".format(str(type(specitem)), specitem),
                },
            )
            raise Exception("Invalid specification")

        if "executor" in specitem:
            executor = specitem["executor"]
            if (not callable(executor)) and (
                not (
                    (isinstance(executor, str))
                    and (len(executor) > 0)
                    and (not hasattr(self, executor))
                )
            ):
                raise Exception("Invalid executor specified: {}".format(executor))

        if "generator" in specitem:
            generator = specitem["generator"]
            if (not callable(generator)) and (
                not (
                    (isinstance(generator, str))
                    and (len(generator) > 0)
                    and (not hasattr(self, generator))
                )
            ):
                raise Exception("Invalid generator specified: {}".format(generator))

        expected = ["name", "cred", "queries"]
        missing = [name for name in expected if name not in specitem]
        if len(missing) > 0:
            logger.error(
                "Query specification items should have required elements",
                extra={
                    "transform": self.name,
                    "data": "Missing {} in:\n{}".format(missing, specitem),
                },
            )
            raise Exception("Invalid specification")

        # => check the cred
        try:
            cred = self.get_credentials(specitem["cred"])
        except:
            logger.exception(
                "Unable to find credentials", extra={"transform": self.name}
            )
            raise Exception("Invalid specification")

        for q in specitem["queries"]:
            if (not isinstance(q, dict)) or (len(q) == 0):
                logger.error(
                    "Empty or invalid query specification",
                    extra={"transform": self.name, "data": "Query: {}".format(q)},
                )
                raise Exception("Invalid specification")

            expected = ["name", "sql", "output"]
            missing = [name for name in expected if name not in q]
            if len(missing) > 0:
                logger.error(
                    "Query specification items should have required elements",
                    extra={
                        "transform": self.name,
                        "data": "Missing {} in:\n{}".format(missing, specitem),
                    },
                )
                raise Exception("Invalid specification")

            if not isinstance(q["sql"], str) or len(q["sql"]) == 0:
                logger.error(
                    "Query specification items has invalid sql",
                    extra={
                        "transform": self.name,
                        "data": "SQL: {}".format(q["sql"]),
                    },
                )
                raise Exception("Invalid specification")

            if "generator" in q:
                generator = q["generator"]
                if (not callable(generator)) and (
                    not (
                        (isinstance(generator, str))
                        and (len(generator) > 0)
                        and (not hasattr(self, generator))
                    )
                ):
                    raise Exception(
                        "Invalid generator specified: {}".format(generator)
                    )

        if "definitions" in specitem:
            definitions = specitem["definitions"]
            if (not isinstance(definitions, dict)) or (len(definitions) == 0):
                logger.error(
                    "Query specification items should have valid definition",
                    extra={
                        "transform": self.name,
                        "data": "Expected non-empty dict. Found:\n{}".format(
                            specitem
                        ),
                    },
                )
                raise Exception("Invalid specification")

            available_names = [q["name"] for q in specitem["queries"]]
            for k, v in definitions.items():
                if (not isinstance(v, list)) or (len(v) == 0):
                    logger.error(
                        "Query specification items should have valid definition",
                        extra={
                            "transform": self.name,
                            "data": "Expected valid non-empty value. Found:\n{}".format(
                                v
                            ),
                        },
                    )
                    raise Exception("Invalid specification")
                missing = [name for name in v if name not in available_names]
                if len(missing) > 0:
                    logger.error(
                        "Query specification items should have valid definition",
                        extra={
                            "transform": self.name,
                            "data": "Missing: {}\n{}".format(missing, definition),
                        },
                    )
                    raise Exception("Invalid specification")

    # Last check whether the requirements can be satisfied
    names = self.args["names"]

    available_names = ["all"]
    for specitem in spec:
        if "definitions" in specitem:
            available_names.extend(list(specitem["definitions"].keys()))
        for q in specitem["queries"]:
            available_names.append(q["name"])

    missing = [name for name in names if name not in available_names]
    if len(missing) > 0:
        logger.error(
            "Invalid names in args",
            extra={
                "transform": self.name,
                "data": "Missing: {}\nAvailable: {}".format(
                    missing, available_names
                ),
            },
        )
        raise Exception("Invalid specification")

FileOperationsBase(*args, **kwargs)

Bases: Trigger

Base class for a FileOperations transform. For now, only one action, 'copy', is supported. More actions will be added in the future.

Example::

    {
        "transform": "FileOperations",
        "enable": true,
        "dependencies": {
           ....
        },
        "args": {
            "actions": [
                {
                    "action": "copy",
                    "src": "%(output)s/%(runid)s/profile.sqlite",
                    "dst": "%(data_root)s/shared/campaigns/profile_daily/profile.sqlite",
                    "backupsuffix": ".backup"
                },
             ]
        }
    }
Source code in enrichsdk/contrib/lib/transforms/fileops/__init__.py
def __init__(self, *args, **kwargs):
    super(FileOperationsBase, self).__init__(*args, **kwargs)
    self.name = "FileOperationsBase"
    self.outputs = {}
    self.dependencies = {}

preload_clean_args(args)

Clean when the spec is loaded...

Source code in enrichsdk/contrib/lib/transforms/fileops/__init__.py
def preload_clean_args(self, args):
    """
    Clean when the spec is loaded...
    """

    # Update the args
    args = super().preload_clean_args(args)

    # Sanity check...
    if not isinstance(args, dict):
        raise Exception("args should be a dictionary")

    if ("actions" not in args) or (not isinstance(args["actions"], list)):
        raise Exception("actions is missing or invalid")

    for a in args["actions"]:
        if not isinstance(a, dict):
            raise Exception("Each action spec should be a dictionary")
        supported = ["copy"]
        if a["action"] not in supported:
            raise Exception("Unsupported action")

        if a["action"] == "copy":
            if ("src" not in a) or ("dst" not in a):
                raise Exception(
                    "Each copy action spec should specify a src and dst"
                )

    return args

process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/fileops/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "{} - process".format(self.name),
        extra=self.config.get_extra({"transform": self.name}),
    )

    msg = ""
    actions = self.args["actions"]
    for a in actions:
        action = a["action"]

        # Pass any global variables...
        srcbase = self.config.get_file(a["src"], extra=self.args)
        dstbase = self.config.get_file(a["dst"], extra=self.args)

        if "files" not in a:
            copyactions = [{"src": srcbase, "dst": dstbase}]
        else:
            copyactions = []
            for f in a["files"]:
                copyactions.append(
                    {
                        "src": os.path.join(srcbase, f),
                        "dst": os.path.join(dstbase, f),
                    }
                )

        backupsuffix = a.get("backupsuffix", ".backup")
        data_root = self.config.get_file("%(enrich_data_dir)s")
        for ca in copyactions:
            src = ca["src"]
            dst = ca["dst"]

            if not os.path.exists(src):
                raise Exception("Could not find source: {}".format(src))

            if os.path.exists(dst):
                backupsuffix = datetime.now().strftime(backupsuffix)
                backupdst = dst + backupsuffix
                if os.path.exists(backupdst):
                    if os.path.isdir(backupdst):
                        shutil.rmtree(backupdst)
                    else:
                        os.remove(backupdst)
                os.rename(dst, backupdst)

            try:
                os.makedirs(os.path.dirname(dst))
            except:
                pass

            # Handle the directory names
            if os.path.isdir(src):
                shutil.copytree(src, dst)
            else:
                shutil.copy(src, dst)

            msg += "Copy: {} => {}\n".format(
                os.path.relpath(src, data_root), os.path.relpath(dst, data_root)
            )

    logger.debug(
        "{} - Completed".format(self.name),
        extra=self.config.get_extra({"transform": self.name, "data": msg}),
    )

    ###########################################
    # => Return
    ###########################################
    return state
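
When a copy action includes a 'files' list, src and dst are treated as directories and each listed file is copied individually; backupsuffix is passed through strftime, so it may contain date patterns. An illustrative action (paths are placeholders)::

{
    "action": "copy",
    "src": "%(output)s/%(runid)s/outputs",
    "dst": "%(data_root)s/shared/daily",
    "files": ["cars.csv", "summary.json"],
    "backupsuffix": ".%Y%m%d"
}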

validate_results(what, state)

Check to make sure that the execution completed correctly

Source code in enrichsdk/contrib/lib/transforms/fileops/__init__.py
def validate_results(self, what, state):
    """
    Check to make sure that the execution completed correctly
    """
    pass

InMemoryQueryExecutorBase(*args, **kwargs)

Bases: AnonymizerMixin, Compute

Base class for an InMemory QueryExecutor transform. This is useful to run queries against SQL backends such as MySQL and process the results in memory.

Features of transform baseclass include:

* Support multiple query engines (via SQLAlchemy)
* Support templatized execution
* Support arbitrary number of queries
* Supports a generator function to generate per-interval queries

Configuration looks like::

...
"args": {
    "cleanup": False,
    "force": True,
    "targets": "all",
    "start_date": "2020-08-01",
    "end_date": "2020-08-03",
}

Specs

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "QueryExecutorBase"
    self.description = "Execute queries against backends"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }
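
A concrete subclass supplies the engine and the SQL specifications. A minimal sketch, assuming the base class can be imported from enrichsdk.contrib.lib.transforms and that SQLAlchemy is available (the connection URI, table, and query are illustrative)::

from sqlalchemy import create_engine
from enrichsdk.contrib.lib.transforms import InMemoryQueryExecutorBase  # assumed import path

class SalesQueryExecutor(InMemoryQueryExecutorBase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.name = "SalesQueryExecutor"

    def get_engine(self, spec):
        # One engine for all specs; a subclass could also switch on spec["engine"]
        return create_engine("mysql+pymysql://user:password@host/salesdb")

    def get_sql_specs(self):
        return [
            {
                "name": "txn_value",
                "sql": ("SELECT dt, SUM(value) AS value FROM txns "
                        "WHERE dt BETWEEN '%(start_date)s' AND '%(end_date)s' "
                        "GROUP BY dt"),
                "segment": None,
            }
        ]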

generic_clean(df)

Do a high level clean of the query result before doing a query-specific clean

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def generic_clean(self, df):
    """
    Do a high level clean of the query result before
    doing a query-specific clean
    """
    return df

get_engine(spec)

Build and return an engine for a given specification.

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def get_engine(self, spec):
    """
    Build and return an engine for a given specification.
    """
    raise Exception("Construct sqlalchemy engine")

get_registry()

Build a registry and return

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def get_registry(self):
    """
    Build a registry and return
    """
    return None

get_specs()

Use get_sql_specs instead.

.. warning:: .. deprecated:: 2.6.0

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def get_specs(self):
    """
    Use get_sql_specs instead.

    .. warning::
        .. deprecated:: 2.6.0

    """

    return []

get_specs_from_sqls(sqldir)

Helper function. Load specifications from the SQLs.

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def get_specs_from_sqls(self, sqldir):
    """
    Helper function. Load specifications from the SQLs.
    """
    specs = []

    files = glob.glob(sqldir + "/*.sql")
    for f in files:
        name = os.path.basename(f).replace(".sql", "")
        sql = open(f).read()

        # Specify the split in the SQL itself..
        segment = None
        engine = None
        match = re.search(r"-- segment:: (\S+)", sql)
        if match is not None:
            segment = match.group(1).strip()

        match = re.search(r"-- name:: (\S+)", sql)
        if match is not None:
            name = match.group(1).strip()

        match = re.search(r"-- engine:: (\S+)", sql)
        if match is not None:
            engine = match.group(1).strip()

        specs.append(
            {"name": name, "sql": sql, "segment": segment, "engine": engine}
        )

    return specs
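
The directives are plain SQL comments at the top of each file. Illustrative contents of a file this helper would parse (names are placeholders)::

sql = """\
-- name:: kyc_txn_summary
-- segment:: global_date
-- engine:: warehouse
SELECT global_date, COUNT(*) AS txns
FROM kyc_transactions
WHERE global_date BETWEEN '%(start_date)s' AND '%(end_date)s'
GROUP BY global_date
"""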

get_sql_specs()

Return a list of query specifications.

Specification: A list of dictionaries. Each dict has

  • name: Name of the specification
  • sql: SQL template
  • categories: String or a list of strings indicating specification groups
  • segment: How to split the dataframe resulting from query execution. Could be none ('complete' as the default name), string (column name) or a callback that generates a { name: df } map
  • paramsets_duration: each instance for one 'day' or a window of days (defined below)
  • paramsets_window: each instance translates into date range for each instance of parameters.

Examples::

Simple: { "name": "txn_value", "sql": "txn_value.sql", "segment": "global_date", }

Simple:

 {
     "categories": ["kyc"],
     "name": "kyc_txn_summary",
     "sql": "kyc_txn_summary.sql",
     "segment": complex_split_callbak,
     "paramsets_duration": "day",
     "retries": 3,
 },
Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def get_sql_specs(self):
    """
    Return a list of query specifications.

    Specification: A list of dictionaries. Each dict has

      * name: Name of the specification
      * sql: SQL template
      * categories: String or a list of strings indicating specification groups
      * segment: How to split the dataframe resulting from query execution. Could be none ('complete' as the default name), string (column name) or a callback that generates a { name: df } map
      * paramsets_duration: each instance for one 'day' or a window of days (defined below)
      * paramsets_window: each instance translates into date range for each instance of parameters.

    Examples::

       Simple:
         {
             "name": "txn_value",
             "sql": "txn_value.sql",
             "segment": "global_date",
         }

       Simple:

         {
             "categories": ["kyc"],
             "name": "kyc_txn_summary",
             "sql": "kyc_txn_summary.sql",
             "segment": complex_split_callbak,
             "paramsets_duration": "day",
             "retries": 3,
         },

    """

    return self.get_specs()

get_supported_extra_args()

Look at the specs to generate a list of options that can be presented to the end-user

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def get_supported_extra_args(self):
    """
    Look at the specs to generate a list of options that
    can be presented to the end-user
    """

    # Collect specs first..
    specs = self.get_sql_specs()

    # Compute the targets
    targets = ["all"]  # default
    for s in specs:
        if not s.get("enable", True):
            continue
        categories = s.get("categories", s.get('category', []))
        if isinstance(categories, str):
            categories = [categories]
        for c in categories:
            if c not in targets:
                targets.append(c)
    for s in specs:
        name = s["name"]
        if name not in targets:
            targets.append(name)
    targets = "|".join(targets)

    # Now construct the args dynamically
    remaining = self.supported_extra_args
    return [
        {
            "name": "targets",
            "description": f"What all to run. Specify multiple with comma separating names ({targets})",
            "default": "all",
            "required": False,
        },
        {
            "name": "force",
            "description": "Force execution",
            "default": "False",
            "required": False,
        },
        {
            "name": "start_date",
            "description": "Start of the time window",
            "default": get_yesterday(),
            "required": True,
        },
        {
            "name": "end_date",
            "description": "End of the time window",
            "default": get_today(),
            "required": True,
        },
    ] + remaining

preload_clean_args(args)

Check validity of the args

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def preload_clean_args(self, args):
    """
    Check validity of the args
    """
    args = super().preload_clean_args(args)

    if ("start_date" not in args) or ("end_date" not in args):
        raise Exception("Start or end of timeframe missing")

    try:
        start = dateparser.parse(args["start_date"]).date()
        args["start_date"] = start
        end = dateparser.parse(args["end_date"]).date()
        args["end_date"] = end
    except:
        logger.exception(
            "Invalid start_date or end_date", extra={"transform": self.name}
        )
        raise Exception("Invalid start/end datetime specified")

    if (
        ("targets" not in args)
        or (not isinstance(args["targets"], str))
        or (len(args["targets"]) == 0)
    ):
        raise Exception("Invalid list of query names specified")

    # Include force
    force = str(args["force"]).lower().strip()
    force = force == "true"
    args["force"] = force

    # Clean the list of names...
    targets = args["targets"].split(",")
    targets = [n.strip() for n in targets if len(n.strip()) > 0]
    args["targets"] = [n for n in targets if len(n) > 0]

    return args

process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    # Will be used in other places..
    self.state = state

    # Get the registry
    self.registry = self.get_registry()

    # => Initialize anonymization if required
    if 'anonymization' in self.args:
        self.anonymize_init(self.args['anonymization'])

    # List of specification names
    targets = self.args["targets"]

    # Get specs..
    specs = self.get_sql_specs()

    logger.debug(f"Specs found: {len(specs)}", extra={"transform": self.name})
    # Now iterate through the specs.
    for spec in specs:
        try:

            name = spec["name"]
            categories = spec.get("categories",
                                  spec.get('category', ['all']))
            if isinstance(categories, str):
                categories = [categories]
            table = spec.get("table", name)
            cond = spec.get("cond", "")
            retries = spec.get("retries", 1)

            # To take care of the logging in case of exception
            msg = f"Name: {name}\n"

            # Check if this has been requested?
            if all([c not in targets for c in categories]) and (
                name not in targets
            ):
                continue

            logger.debug(
                f"Executing {spec['name']}",
                extra={
                    "transform": self.name,
                    "data": json.dumps(spec, indent=4, cls=SafeEncoder),
                },
            )

            sql_template = spec["sql"]

            files = []

            paramsets = self.generate_paramsets(
                spec, self.args["start_date"], self.args["end_date"]
            )

            for params in paramsets:

                status = []

                msg = f"Params: {params}\n"
                msg += f"Insert Table: {table} with {cond}\n"

                # Now log the SQL
                sql = sql_template % params
                msg += "SQL:\n{}\n".format(sql)

                # Get the engine for a given spec
                engine = self.get_engine(spec)

                segmentcol = spec.get("segment", None)

                tryno = 1
                while True:
                    if tryno > retries:
                        raise Exception("Exceeded max retries")

                    try:
                        df = pd.read_sql(sql, engine)
                        break
                    except:
                        logger.exception(
                            f"Failed Query: {name} (try {tryno})",
                            extra={"transform": self.name, "data": msg},
                        )
                    tryno += 1
                    time.sleep(30)

                # Do some basic cleaning. int becomes float
                df = self.generic_clean(df)

                msg += f"Segment: {segmentcol} (Initial split)\n"
                msg += "Records: {}\n".format(df.shape[0])
                msg += "Columns: {}\n".format(", ".join(df.columns))
                msg += "Dtypes: " + df.dtypes.to_string() + "\n"

                skip_empty = spec.get("skip_empty", True)
                if len(df) == 0:
                    # no data returned...
                    if skip_empty:
                        logger.warning(
                            f"Completed {name} {params['start_date']} No data",
                            extra={"transform": self.name, "data": msg},
                        )
                        continue
                    else:
                        logger.warning(
                            f"{name} {params['start_date']} No data",
                            extra={"transform": self.name, "data": msg},
                        )

                if ((len(df) == 0) and (not callable(segmentcol))):
                    msg = """Dont know how to handle an empty dataframe. Not sure what columns should be included with what values. segmentcol should be a callable""" + msg
                    logger.warning(f"{name} {params['start_date']} Skipping",
                                   extra={
                                       "transform": self.name,
                                       "data": msg
                                   })
                    continue

                # First gather a map of segments
                filemap = {}
                if segmentcol is None:
                    # Whole thing is one segment
                    filemap["complete"] = df
                elif isinstance(segmentcol, str):
                    # Split by column name...
                    segments = list(df[segmentcol].unique())
                    msg += f"Segments: {len(segments)} ({segmentcol})\n"
                    for segment in segments:
                        try:
                            df1 = df[df[segmentcol] == segment]
                            segment = str(segment)
                            filemap[segment] = df1
                        except:
                            pass
                elif callable(segmentcol):
                    # Custom split of the dataframe...
                    filemap = segmentcol(self, spec, params, df)
                    msg += f"Segments: {len(filemap)}\n"
                else:
                    raise Exception(f"Unhandled segment definition: {segmentcol}")

                # => Process each segment obtained from
                # the previous step...
                for segment, df1 in sorted(filemap.items()):

                    # Add note about what is being stored..
                    msg += f"[{segment}] {df1.shape[0]} records\n"

                    # Clean the output data...
                    try:
                        if "clean" in spec:
                            callback = spec["clean"]["callback"]
                            if callable(callback):
                                clean_msg, df1, clean_files = callback(
                                    self, segmentcol, segment, df1, spec
                                )
                                if len(clean_msg) > 0:
                                    msg += f"[{segment}] " + clean_msg + "\n"
                                files += clean_files
                    except Exception as e:
                        #traceback.print_exc()
                        msg += str(e)
                        raise

                    # Store in database...
                    try:
                        extra_dependencies = []
                        if "store" in spec:
                            # Separate storage handler..
                            callback = spec["store"]["callback"]
                            if callable(callback):
                                store_msg, store_dependencies, store_files = callback(
                                    self, segmentcol, segment, df1, spec
                                )
                                if len(store_msg) > 0:
                                    msg += f"[{segment}] " + store_msg + "\n"
                                extra_dependencies += store_dependencies
                                files += store_files

                    except Exception as e:
                        #traceback.print_exc()
                        msg += str(e)
                        raise

                    # Handle a default store for all specs, segments
                    try:


                        # => Anonymize the data
                        if hasattr(self, 'anonargs'):
                            anon_df1 = self.anonymize_target(spec['name'], df=df1)
                        else:
                            anon_df1 = None

                        # Store in s3 etc.
                        store_msg, store_dependencies, store_files  = self.store(
                            segmentcol, segment, df1, anon_df1, spec
                        )
                        if len(store_msg) > 0:
                            msg += f"[{segment}] " + store_msg

                        extra_dependencies += store_dependencies
                        files += store_files

                        # update lineage
                        self.update_frame(
                            name, engine, sql, df1, extra_dependencies
                        )


                    except Exception as e:
                        #traceback.print_exc()
                        msg += "[{}] Exception {}\n".format(segment, traceback.format_exc()) + "\n"

                logger.debug(
                    f"Completed {name} {params['start_date']}",
                    extra={"transform": self.name, "data": msg},
                )

            # Make note of it.
            dataset = self.registry.find(spec['name'])
            if ((dataset is not None) and (len(files) > 0)):
                metadata = { 'files': files}
                self.registry.access(dataset, metadata, nature='write')

        except:
            #traceback.print_exc()
            # Exception for each spec.
            logger.exception(
                f"Unable to run query: {name}",
                extra={"transform": self.name, "data": msg},
            )
            msg = ""
            continue

    self.add_marker(state)


    # Done
    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state

update_frame(name, engine, sql, df, dependencies=[])

Note the lineage for each output file.

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def update_frame(self, name, engine, sql, df, dependencies=[]):
    """
    Note the lineage for each output file.
    """

    # Check if it has already been registered
    if self.state.has_frame(name):
        return

    # Get the default database
    database = engine.url.database

    # Insert extra dependencies
    try:
        dependencies += get_lineage_of_query(engine, sql)
    except:
        dependencies = []
        logger.warning("Unable to get lineage",
                         extra={
                             'transform': self.name,
                             'data': f"SQL being checked:\n {sql}"
                         })

    # Generate column information...
    columns = self.get_column_metadata(name, df)

    ## => Gather the update parameters
    updated_detail = {
        "df": df,
        "description": f"Output for query {name}",
        "transform": self.name,
        "frametype": "pandas",
        "params": [
            {
                "type": "compute",
                "columns": columns,
            },
        ],
    }

    if len(dependencies) > 0:
        lineage = {"type": "lineage", "dependencies": dependencies}
        updated_detail['params'].append(lineage)

    # Dump it into the shared state
    self.state.update_frame(name, updated_detail, create=True)

MetricsBase(*args, **kwargs)

Bases: Compute

Compute metrics as input for the anomaly/other computation

Features of transform baseclass include:

* Flexible configuration
* Highlevel specification of dimensions and metrics
Source code in enrichsdk/contrib/lib/transforms/metrics/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "MetricsBase"
    self.description = "Compute metrics against datasources"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }

get_dataset_generic(source)

Use the dataset object to read the dataset

Source code in enrichsdk/contrib/lib/transforms/metrics/__init__.py
def get_dataset_generic(self, source):
    """
    Use the dataset object to read the dataset
    """

    if (not hasattr(self, "read_data")) or (not hasattr(self, "get_dataset")):
        raise Exception(
            " get_dataset_generic expects read_data and get_dataset methods"
        )

    args = self.args

    start_date = source.get('start_date', self.args["start_date"])
    end_date = source.get('end_date', self.args["end_date"])

    name = source["name"]
    dataset = source["dataset"]
    params = source.get("params", {})
    filename = source.get('filename', 'data.csv')

    cache = args.get("cache", False)
    cachename = f"{dataset}-{start_date}-{end_date}-{filename}"
    cachefile = f"cache/{self.name}-cache-{cachename}"

    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            logger.debug(
                "Read cached {}".format(name), extra={"transform": self.name}
            )

            df = pd.read_csv(cachefile, **params)
            return {name: df}

    datasetobj = self.get_dataset(dataset)

    if hasattr(self, 'update_doodle'):
        self.update_doodle(datasetobj, source['filename'])

    df, metadata = datasetobj.read_data(
        start_date,
        end_date,
        filename=source["filename"],
        readfunc=self.read_data,
        params=params,
    )

    logger.debug("Read {}".format(name), extra={"transform": self.name})

    # Cache it for future use...
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if ("files" in metadata) and (len(metadata["files"]) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": [metadata["files"][-1]],
                },
            ],
        }

    self.update_frame(source, df, lineage)

    return {name: df}
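
An illustrative source entry that this method can consume (dataset and file names are assumptions; start_date/end_date default to the transform args when omitted)::

source = {
    "name": "transactions",
    "dataset": "transactions_daily",
    "filename": "data.csv",
    "params": {"dtype": {"txn_id": str}},   # passed to the read function
    "start_date": "2020-08-01",             # optional override
    "end_date": "2020-08-03",               # optional override
}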

get_datasets(profile, specs)

Load the datasets specified by the profile

Source code in enrichsdk/contrib/lib/transforms/metrics/__init__.py
def get_datasets(self, profile, specs):
    """
    Load the datasets specified by the profile
    """

    if not isinstance(profile, dict) or len(profile) == 0:
        logger.warning("Empty profile", extra={"transform": self.name})
        return {}

    # Get various kinds of handlers..
    handlers = self.get_handlers(profile)
    if not isinstance(handlers, dict) or len(handlers) == 0:
        logger.warning("No handlers specified", extra={"transform": self.name})
        handlers = {}

    required_sources = []
    for s in specs:
        required_sources.extend(s["sources"])
    required_sources = list(set(required_sources))

    # Now go about constructing the datasets
    datasets = {}

    found = []
    sources = self.get_sources(profile)
    for source in sources:

        nature = source.get("nature", "db")
        name = source["name"]

        if name not in required_sources:
            continue
        found.append(name)

        pipeline = source.get("pipeline", None)
        generate = source.get("generate", None)

        # Only db is used for now...
        try:
            if nature == "db":
                result = self.read_db_source(source)
            elif (
                (generate is not None)
                and (generate in handlers)
                and (callable(handlers[generate]))
            ):
                result = handlers[generate](source)
            elif (generate is not None) and (hasattr(self, generate)):
                result = getattr(self, generate)(source)
            else:
                raise Exception(f"Invalid specification: {name}")
        except:
            logger.exception(
                f"[{name}] generation failed", extra={"transform": self.name}
            )
            continue

        # Clean the read the dataset...
        try:
            if pipeline is not None and isinstance(pipeline, list):
                for processor in pipeline:
                    if isinstance(processor, str):
                        if processor in handlers:
                            result = handlers[processor](result, source)
                        elif hasattr(self, processor):
                            result = getattr(self, processor)(result, source)
                        else:
                            raise Exception(f"Missing post-processor: {processor}")
                    elif callable(processor):
                        result = processor(result, source)
                    else:
                        raise Exception(
                            "Only method names/callables are supported are supported"
                        )
        except:
            logger.exception(
                f"[{name}] post-processing failed", extra={"transform": self.name}
            )
            continue

        # We could return multiple values or a single value
        if isinstance(result, dict):
            datasets.update(result)
        else:
            datasets[name] = result

    missing = [s for s in required_sources if s not in found]
    if len(missing) > 0:
        logger.error(
            f"Missing {len(missing)} sources",
            extra={
                "transform": self.name,
                "data": ", ".join(missing)
            }
        )
        raise Exception("Missing sources")

    return datasets

get_db_uri(source)

Return database URI for a source

Source code in enrichsdk/contrib/lib/transforms/metrics/__init__.py
def get_db_uri(self, source):
    """
    Return database URI for a source
    """
    return source["uri"]

get_handlers(profile)

Define various callbacks that take a dataframe, spec and compute. Specific to a single profile.

Source code in enrichsdk/contrib/lib/transforms/metrics/__init__.py
def get_handlers(self, profile):
    """
    Define various callbacks that take a dataframe, spec
    and compute. Specific to a single profile.
    """
    return {}
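
A subclass would typically override this to expose its own callables to the profile. A minimal sketch, with a hypothetical subclass and base-class name::

    class MyMetrics(MetricsBase):    # MetricsBase stands in for this base class

        def get_handlers(self, profile):
            # names referenced by "generate"/"pipeline" entries in the profile
            return {
                "generate_orders": self.generate_orders,
                "clean_orders": self.clean_orders,
            }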

get_printable_db_uri(engine)

pretty print the URL

Source code in enrichsdk/contrib/lib/transforms/metrics/__init__.py
def get_printable_db_uri(self, engine):
    """
    pretty print the URL
    """
    username = engine.url.username
    host = engine.url.host
    database = engine.url.database
    drivername = engine.url.get_driver_name()

    return f"{drivername}:///{host}/{database}/"

process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/metrics/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    # Will be used in other places..
    self.state = state

    # Get the profile spec
    is_valid, profile, msg = profilespec.get_profile(self, None)
    if is_valid:
        logger.debug(
            f"Loaded profilespec",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")

    # Get specs..
    specs = self.get_specs(profile)

    # First get the datasets
    datasets = self.get_datasets(profile, specs)

    # Now go through each spec and get the output..
    for spec in specs:
        enable = spec.get("enable", True)
        if not enable:
            continue
        self.process_spec(datasets, profile, spec)

    # Done
    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state

process_spec_default(datasets, profile, spec)

Handle one specification at a time..

Source code in enrichsdk/contrib/lib/transforms/metrics/__init__.py
def process_spec_default(self, datasets, profile, spec):
    """
    Handle one specification at a time..
    """

    if ("dimensions" not in spec) or (not isinstance(spec["dimensions"], dict)):
        raise Exception("Dimensions in spec should be a dict")

    if ("metrics" not in spec) or (not isinstance(spec["metrics"], dict)):
        raise Exception("Metrics in spec should be a dict")

    # Get hold of the data first...
    sources = self.get_spec_sources(spec, datasets)

    if len(sources) > 1:
        raise Exception("Use custom spec handler for multiple sources")

    datasetdf = list(sources.values())[0]

    # now go through each of the dimensions
    dimensions = spec["dimensions"]
    metrics = spec["metrics"]

    _dfs = []
    for name, cols in dimensions.items():

        if isinstance(cols, str):
            cols = [cols]

        # Don't need to include other columns...
        relevant = cols + list(metrics.keys())
        df = datasetdf[relevant]

        # Check if there are lists and explode them...
        for col in cols:
            if isinstance(df.iloc[0][col], list):
                df = df.explode(col)

        # Construct aggregates...
        df = df.groupby(cols)
        df = df.agg(metrics)

        # Clean up the index if multiple columns are specified
        if len(cols) > 1:
            df.index = df.index.map("+".join)
        df.index.name = "value"
        df = df.reset_index()

        # Also cleanup the column names...
        def clean_colname(what):
            if isinstance(what, (list, tuple)):
                what = "_".join(what)
                what = what.rstrip("_").lstrip("_")
            return what

        df.columns = df.columns.map(clean_colname)

        df.insert(0, "dimensions", name)

        _dfs.append(df)

    # merge all
    df = pd.concat(_dfs)
    del _dfs

    return {spec["name"]: df}
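
To illustrate the expected shape of a spec handled by this default implementation (column names and aggregations below are hypothetical)::

    spec = {
        "name": "order_metrics",
        "sources": ["orders"],
        "dimensions": {
            "city": "city",                       # single-column dimension
            "city_channel": ["city", "channel"],  # composite dimension; index values joined with "+"
        },
        "metrics": {
            "amount": "sum",          # passed straight through to DataFrame.agg()
            "order_id": "nunique",
        },
    }

Each dimension yields one aggregated frame with columns dimensions, value and the aggregated metrics; the frames are concatenated and returned keyed by the spec name.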

NotebookExecutorBase(*args, **kwargs)

Bases: Compute

A built-in transform baseclass to handle standard notebook operation and reduce the duplication of code.

Features of this transform include:

* Support for custom args and environment
* Support for automatic capture and surfacing of output and err

Configuration looks like::

 class MyTestNotebook(NotebookExecutorBase):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.name = "TestNotebook"
         self.notebook = os.path.join(thisdir, "Test-Notebook.ipynb")

     @classmethod
     def instantiable(cls):
         return True

     def get_environment(self):
         return {
             'SECRET': credentials
         }
Source code in enrichsdk/contrib/lib/transforms/notebook_executor/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "NotebookExecutorBase"
    self.notebook = None
    self._environ = os.environ.copy()

get_environment()

Pass any additional parameters...

Source code in enrichsdk/contrib/lib/transforms/notebook_executor/__init__.py
def get_environment(self):
    """
    Pass any additional parameters...
    """
    return {}

get_notebook()

Define notebook that must be executed

Returns:

str: Path to the notebook
Source code in enrichsdk/contrib/lib/transforms/notebook_executor/__init__.py
def get_notebook(self):
    """
    Define notebook that must be executed

    Returns:

        str: Path to the notebook
    """
    if (
        (not hasattr(self, "notebook"))
        or (self.notebook is None)
        or (not isinstance(self.notebook, str))
        or (not os.path.exists(self.notebook))
    ):
        raise Exception(
            "Missing notebook. Missing/invalid path: {}".format(
                getattr(self, "notebook", "")
            )
        )

    notebook = self.notebook
    notebook = os.path.abspath(notebook)
    return notebook

preload_clean_args(args)

Standard args preprocessor. Make sure that an artifacts directory is created for storing the configuration file, output notebook and stdout/err.

Source code in enrichsdk/contrib/lib/transforms/notebook_executor/__init__.py
def preload_clean_args(self, args):
    """
    Standard args preprocessor. Make sure that
    an artifacts directory is created for storing the
    configuration file, output notebook and stdout/err.

    """
    args = super().preload_clean_args(args)

    # Insert artifacts if not available..
    if "artifacts" not in args:
        args["artifacts"] = self.get_file(
            "%(output)s/%(runid)s/artifacts", create_dir=True
        )
        try:
            os.makedirs(args["artifacts"])
        except:
            pass

    return args

process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/notebook_executor/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "{} - process".format(self.name),
        extra=self.config.get_extra({"transform": self.name}),
    )

    config = self.args
    configfile = os.path.join(config["artifacts"], "config.json")
    dump = lambda: json.dumps(config, indent=4, default=str)
    with open(configfile, "w") as fd:
        fd.write(dump())

    logger.debug(
        "Parameters to script",
        extra={
            "transform": self.name,
            "data": "Config: {}\n---\n".format(configfile) + dump(),
        },
    )

    # Update the environ
    _environ = os.environ.copy()
    try:
        # Update the environ
        update = self.get_environment()
        os.environ.update(update)

        # Now run the notebook
        self.run_notebook(config, configfile)

    finally:
        os.environ.clear()
        os.environ.update(_environ)

    return state

validate_results(what, state)

Check to make sure that the execution completed correctly

Source code in enrichsdk/contrib/lib/transforms/notebook_executor/__init__.py
def validate_results(self, what, state):
    """
    Check to make sure that the execution completed correctly
    """
    pass

SyntheticDataGeneratorBase(*args, **kwargs)

Bases: Compute

Generate synthetic data given a specification

Features of transform baseclass include:

* Flexible configuration
* Highlevel specification of synthetic data in each column
    * instance: pre-defined faker-based instances
    * distribution: pre-defined from statistical distributions
    * custom: custom defined in base/derived class

Source code in enrichsdk/contrib/lib/transforms/synthetic_data_generator/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "SyntheticDataGeneratorBase"
    self.description = "Generate synthetic data from a specification"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }
    self.fakeObj = None # for synthetic data generation, to be inited later on

anon_email(data, col_name, column)

Method to anonymize email data. Can generate emails to match or not match data in some name field. Also respects original email domain distribution if required. Input is the full dataframe, output is the relevant column being anonymized.

Source code in enrichsdk/contrib/lib/transforms/synthetic_data_generator/__init__.py
def anon_email(self, data, col_name, column):
    '''
    Method to anonymize email data. Can generate emails to match or not match
    data in some name field. Also respects original email domain distribution if required.
    Input is the full dataframe, output is the relevant column being anonymized.
    '''
    msg = ""

    match_names = column.get("match_names", True)
    if match_names is True:
        if "name_field" not in column:
            msg += f"Column {col_name} -- Unknown name field to match emails, setting random emails" + "\n"
            match_names = False
            return np.nan
        else:
            if column["name_field"] not in data.columns:
                msg += f"Column {column['name']} -- name field not in dataframe, setting random emails" + "\n"
                match_names = False
                return np.nan

    def generate_email(fakeObj, row, col_name, column, match_names):
        # whitelist of email domains
        # if the original email is in this list, don't replace it
        # useful to maintain data distribution
        domain_whitelist = ['gmail.com',
                            'yahoo.com',
                            'hotmail.com',
                            'aol.com']

        email_col_name  = col_name
        orig_domain     = row[email_col_name].split('@')[1]

        # set the email domain first
        if column.get("dist", "yes") == "yes":
            # we need to ensure that the distribution of generated email domains
            # match what was present in the input
            # popular free email domains will carry over, while others will be
            # replaced with random domains while still retaining distribution
            if any([d==orig_domain for d in domain_whitelist]):
                # retain the original domain name
                domain = orig_domain
            else:
                # get a new domain name
                domain = fakeObj['generators']['email_domain'][orig_domain]
        else:
            # no need to match distribution of generated email domains
            domain = fakeObj['faker'].ascii_email().split('@')[1]

        if match_names is True:
            # we want to match the anon email with the name field
            name = row[column['name_field']]
            names = unidecode.unidecode(name).lower().split(' ')
        else:
            # we don't care about matching the anon email with the name field
            names = fakeObj['faker'].name().split(' ')

        firstname = names[0]
        lastname = names[-1]

        # possible variations of email
        nameparts = {
            1: f"{firstname}",
            2: f"{lastname}",
            3: f"{firstname}.{lastname}",
            4: f"{firstname}.{firstname[0]}.{lastname}",
            5: f"{firstname}.{lastname[0]}.{lastname}",
            6: f"{firstname}.{firstname[0]}.{lastname[0]}",
            7: f"{firstname}.{random.randint(1,10000)}",
            8: f"{firstname}_{random.randint(1,10000)}",
            9: f"{firstname}.{lastname}.{random.randint(1,10000)}",
        }
        choice = random.randint(1, len(nameparts))
        namepart = nameparts[choice]
        email = f"{namepart}@{domain}"

        return email

    val = data.apply(lambda x: generate_email(self.fakeObj, x, col_name, column, match_names), axis=1)

    return val
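
A hypothetical column entry for this anonymizer; the keys shown are the ones the method reads, and the field names are made up::

    column = {
        "anon_type": "email",
        "match_names": True,        # derive the local part of the email from a name column
        "name_field": "full_name",  # must exist in the dataframe when match_names is True
        "dist": "yes",              # "yes" -> preserve the original email-domain distribution
    }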

anon_numeric(data, col_name, column)

Method to fuzz numeric data. Various fuzzing methods can be defined here. Input is the full dataframe, output is the relevant column being fuzzed.

Source code in enrichsdk/contrib/lib/transforms/synthetic_data_generator/__init__.py
def anon_numeric(self, data, col_name, column):
    '''
    Method to fuzz numeric data. Various fuzzing methods
    can be defined here.
    Input is the full dataframe, output is the relevant column being fuzzed.
    '''
    msg = ""

    method      = column.get("method", "perturb")
    params      = column.get("params", {})

    val = data[col_name]

    if method == "perturb":
        range = params.get("range", 0.05)
        val += random.uniform(-range*val, range*val)
    else:
        msg = f"Column {column['name']} -- Unknown method to anon column, setting default NaNs" + "\n"
        val = np.nan

    return val
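
A hypothetical numeric column entry; only method and params.range are consulted here::

    column = {
        "anon_type": "numeric",
        "method": "perturb",        # currently the only built-in fuzzing method
        "params": {"range": 0.05},  # each value is perturbed by up to +/- 5% of itself
    }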

anonymize_dataset(spec, data)

Anonymize a dataset given a spec

Source code in enrichsdk/contrib/lib/transforms/synthetic_data_generator/__init__.py
def anonymize_dataset(self, spec, data):
    '''
    Anonymize a dataset given a spec
    '''
    msg = ""
    name    = spec['name']
    config  = spec['config']

    # whether to anonymize all columns or a spec is defined
    columns_to_anon = "all" if "columns" not in config else "given"

    df_columns = data.columns
    columns = config.get('columns', {})

    # run through each column and try to anonymize
    anon_columns = []
    for col_name, col_obj in columns.items():
        include = col_obj.get("include", "yes")
        if include != "yes":
            continue
        params = {}
        if col_name not in df_columns:
            msg += f"Column: {col_name} not found, skipping" + "\n"
        else:
            data[col_name], l_msg = self.anonymize_single_column(col_name, col_obj, data, params)
            anon_columns.append(col_name)
            msg += l_msg

    # drop the other columns if required by spec
    action = config.get("nontransformed", "retain")
    if action == "drop":
        data = data[anon_columns]

    msg += note(data, "Anonymized dataset") + "\n"

    logger.debug(
        f"Spec: {name} dataset anonymized",
        extra={"transform": self.name, "data": msg}
    )

    return data
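
Putting the column entries together, a spec for this method might look like the following sketch (all names hypothetical)::

    spec = {
        "name": "customers-anon",
        "config": {
            "nontransformed": "drop",    # drop columns that were not anonymized
            "columns": {
                "email": {"include": "yes", "anon_type": "email", "name_field": "full_name"},
                "age":   {"include": "yes", "anon_type": "numeric", "method": "perturb"},
                "notes": {"include": "no"},   # skipped, and dropped given "drop" above
            },
        },
    }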

anonymize_single_column(col_name, col_obj, data, params)

Takes a dataset and anonymizes the specified column

Source code in enrichsdk/contrib/lib/transforms/synthetic_data_generator/__init__.py
def anonymize_single_column(self, col_name, col_obj, data, params):
    '''
    Takes a dataset and anonymizes the specified column
    '''
    msg = ""

    # get the faker object
    fakeObj = self.fakeObj

    # setup handlers for the various anonymization types
    generators = {}
    # first for the lookup generators
    for g, lookup in fakeObj['generators'].items():
        generators[g] = {
            "type": "lookup",
            "handler": lookup
        }
    # then for the custom generators
    generators["numeric"] = {
        "type": "custom",
        "handler": "anon_numeric"
    }
    generators["email"] = {
        "type": "custom",
        "handler": "anon_email"
    }

    anon_type = col_obj['anon_type']
    _d = []
    if anon_type in generators:
        gen_type = generators[anon_type]['type']
        gen_handler = generators[anon_type]['handler']
        if gen_type == "lookup":
            # we call the apply only on the specific column
            data = data[col_name].apply(lambda x: gen_handler[x])
        else:
            handler = getattr(self, gen_handler)
            # we call the apply to the full dataframe, we may need other columns
            # return is only the relevant column
            data = handler(data, col_name, col_obj)
        msg += f"Column: {col_name} anonymized" + "\n"
    else:
        data = np.nan
        msg += f"Column: {col_name} -- No <{anon_type}> generator found, defaulting to NaN" + "\n"

    return data, msg

get_dataset_s3(spec)

Use the dataset object to read the dataset

Source code in enrichsdk/contrib/lib/transforms/synthetic_data_generator/__init__.py
def get_dataset_s3(self, spec):
    """
    Use the dataset object to read the dataset
    """
    run_date    = self.args['run_date']
    name        = spec["name"]
    config      = spec['config']


    for f in ["dataset", "filename"]:
        if f not in config:
            msg = f"{f} param needed in config " + "\n"
            logger.exception(
                f"Dataset: {name} -- skipping", extra={"transform": self.name, "data": msg}
            )
            return None

    source      = config.get('source', 'registry')
    dataset     = config['dataset']

    pieces      = dataset.split('-')
    dataset_main = "-".join(pieces[:-1])
    dataset_subset = pieces[-1]
    filename    = config["filename"]
    params      = config.get("params", {})

    cache = self.args.get("cache", False)
    cachename = f"{dataset}-{run_date}"
    cachefile = f"cache/{self.name}-anonymizer-cache-" + cachename + ".csv"

    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            msg = f"Location: {cachefile}" + "\n"
            df = pd.read_csv(cachefile, **params)
            msg += note(df, f"Cached {dataset}") + "\n"
            logger.debug(f"Read cached {name}", extra={"transform": self.name, "data": msg})
            return df

    if source == "registry":
        if not hasattr(self, "get_dataset"):
            raise Exception(
                "get_dataset_s3 expects get_dataset method"
            )
        datasetobj = self.get_dataset(dataset_main) # this method should be defined in the derived class

        if hasattr(self, 'update_doodle'):
            self.update_doodle(datasetobj, filename)

        df, metadata = datasetobj.read_data(
            run_date,
            run_date,
            filename=filename,
            readfunc=self.read_s3_data,
            params=params,
        )
    elif source == "direct":
        params = {}
        df = self.read_s3_data(filename, params)
        metadata = { "files": [filename] }
    else:
        logger.exception(
            f"Dataset: {name} -- unknown source param: {source}, skipping", extra={"transform": self.name}
        )
        return None

    msg = note(df, f"Fresh {dataset}") + "\n"
    logger.debug(f"Read fresh {name}", extra={"transform": self.name, "data": msg})

    # Cache it for future use...
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if ("files" in metadata) and (len(metadata["files"]) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": [metadata["files"][-1]],
                },
            ],
        }

    if not self.state.has_frame(spec['name']):
        self.update_frame(spec, f"Dataset: {dataset}", df, lineage)

    return df
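
A hypothetical config consumed by this method; dataset and filename are mandatory, and source defaults to "registry"::

    config = {
        "source": "registry",             # or "direct" to read the filename straight from S3
        "dataset": "sales-transactions",  # "<main>-<subset>"; the final "-" segment is treated as the subset
        "filename": "data.csv",
        "params": {"sep": ","},           # forwarded to the reader
    }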

get_handlers(spec)

Define various callbacks that take a dataframe, spec and compute.

Source code in enrichsdk/contrib/lib/transforms/synthetic_data_generator/__init__.py
def get_handlers(self, spec):
    """
    Define various callbacks that take a dataframe, spec
    and compute.
    """
    return {}

process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/synthetic_data_generator/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    # Will be used in other places..
    self.state = state

    # init the faker object for data generation
    self.fakeObj = self.init_faker_object()

    # Get the profile spec
    is_valid, profile, msg = profilespec.get_profile(self, "syntheticdata")
    if is_valid:
        logger.debug(
            f"Loaded profilespec",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")

    # get the dataset lookup table
    customer_datasets = profilespec.construct_dataset_list(self, profile)

    # Now go through each spec and generate synthetic data for it
    specs = profile.get("specs", [])   # each spec describes one dataset to generate/anonymize
    for spec in specs:

        process_spec = True

        enabled = spec.get("enable", True)
        if not enabled:
            logger.debug(
                f"Spec <{spec.get('name', 'NO NAME')}> not enabled, skipping.",
                extra={"transform": self.name}
            )
            process_spec = False
            continue

        for f in ["name", "config"]:
            if f not in spec:
                logger.error(
                    f"Spec has no {f} param set, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                process_spec = False
                break

        if process_spec == False:
            # something is wrong with this spec, skip it
            continue

        # process the spec
        name    = spec['name']
        action  = spec.get('action', 'anonymize')

        if action == 'generate':
            # we have a generate a synthetic dataset
            frametype = "synthetic"
            data = self.generate_dataset(spec)
        elif action == 'anonymize':
            # we have to anonymize a given dataset
            frametype = "anonymized"
            # first, load it
            data = self.load_dataset(spec, customer_datasets)

            # then, anonymize it
            if data is not None:
                data = self.anonymize_dataset(spec, data)
            else:
                msg = "Could not anonymize dataset" + "\n"
                logger.exception(
                    f"Spec: {spec['name']} -- skipping",
                    extra={"transform": self.name}
                )
        else:
            logger.exception(
                f"Unknown action param in spec, skipping spec: {spec['name']}",
                extra={"transform": self.name}
            )

        # store the generated dataset
        if data is not None:
            self.store_result(spec, data)

            # update frame for pipline
            description = spec.get(f"desc -- {frametype}", f"{frametype.title()} generated dataset")
            lineage = {
                "type": "lineage",
                "transform": self.name,
                "dependencies": [
                    {
                        "type": "file",
                        "nature": "input",
                        "objects": [spec.get("filename", "__NEW__")],
                    },
                ],
            }
            self.update_frame(
                spec,
                description,
                data,
                lineage,
            )

    # Done
    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state
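
A minimal sketch of one spec in the "syntheticdata" profile, as interpreted by the loop above (names are illustrative)::

    spec = {
        "name": "customers-anon",
        "enable": True,
        "action": "anonymize",   # or "generate" to build a fully synthetic dataset
        "config": {
            # dataset/filename/columns as described for get_dataset_s3 and anonymize_dataset above
        },
    }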

TimeSeriesForecasterBase(*args, **kwargs)

Bases: Compute

Take a timeseries and project its future values with exogenous variables.

Features of transform baseclass include:

* Flexible configuration
* Highlevel specification of time series forecasting
    * specified data source or custom method to generate one
    * by default, forecast using Facebook's Prophet library, or custom defined forecasters using other libraries

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "TimeSeriesForecasterBase"
    self.description = "Forecast future values of a timeseries"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }

    self.default_strategy = "prophet"
    self.default_type = "vanilla"

    self.epoch = time.time()    #for output path

combined_dataset(spec, data)

Adds the combined dataset to the data dict

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def combined_dataset(self, spec, data):
    """
    Adds the combined dataset to the data dict
    """
    config = spec['config']
    combined_dataset = pd.DataFrame()

    if "combine_sources" in config:
        combine_sources = config["combine_sources"]
        dataset = combine_sources.get("dataset", None)

        if hasattr(self, dataset):
            params = combine_sources.get("params", {})
            handler = getattr(self, dataset)
            combined_dataset =  handler(params, data, spec)
            data['combined'] = combined_dataset

            msg = note(combined_dataset, "Combined dataset")
            logger.debug(f"Combined dataset for {spec['name']}",
                         extra={"transform": self.name, "data": msg})

    return data
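
The relevant portion of a spec config, assuming the named combiner method exists on the transform (names are hypothetical)::

    config = {
        "combine_sources": {
            "dataset": "merge_observations_exovars",  # method defined on the transform
            "params": {"join_on": "date"},            # forwarded to that method
        },
    }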

get_dataset_s3(spec, source, paths, start_date, end_date)

Gets all files from paths and puts them together into a single dataframe. If self.args['cache']==True, then this consolidated dataframe is cached / read from cache as applicable.

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def get_dataset_s3(self, spec, source, paths, start_date, end_date):
    '''
    Gets all files from paths and puts them together
    into a single dataframe. If self.args['cache']==True,
    then this consolidated dataframe is cached / read from cache
    as applicable.
    '''
    msg = ""

    run_date    = self.args['run_date']
    config      = spec['config']
    dataset     = source['dataset']
    params      = source.get('params', {})

    cache = self.args.get("cache", False)
    cachename = f"{dataset}-{start_date}-to-{end_date}"
    cachefile = f"cache/{self.name}-rawdata-cache-" + cachename + ".csv"

    # read from cache if available
    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            msg = f"Location: {cachefile}" + "\n"
            df = pd.read_csv(cachefile, **params)
            logger.debug(f"Read cached {dataset}", extra={"transform": self.name, "data": msg})
            return df

    # read from S3
    dfs = []
    for path in paths:
        _df = self.read_s3_data(path, params)
        if _df is None:
            msg += f"Path error, skipping: {path}" + "\n"
            continue
        msg += f"Read from path: {path}" + "\n"
        dfs.append(_df)
    df = pd.concat(dfs)

    logger.debug(f"Read fresh {dataset}", extra={"transform": self.name})

    # Cache it for future use
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if (len(paths) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": paths,
                },
            ],
        }

    if not self.state.has_frame(spec['name']):
        self.update_frame(spec, f"Dataset: {dataset}", df, lineage)

    return df

get_datewindow(source, spec)

Set the time window for observations and exogenous variables. Both are taken from the args if present; otherwise start_date defaults to 60 days prior to end_date, and end_date defaults to the day prior to run_date (which is usually today).

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def get_datewindow(self, source, spec):
    """
    Set the time window for observations and exogenous variables.
    Both are taken from args if present; otherwise start_date
    defaults to 60 days prior to end_date, and end_date defaults
    to the day prior to run_date (which is usually today).
    """
    datewindow = {}
    default_delta = 60

    run_date = self.args['run_date']

    try:
        if 'end_date' in self.args and self.args['end_date']:
            end_date = datetime.fromisoformat(self.args['end_date'])
        else:
            logger.debug(
                f"End date not in args. Using yesterday's date.")
            end_date = run_date - timedelta(days=1)

        if 'start_date' in self.args and self.args['start_date']:
            start_date = datetime.fromisoformat(self.args['start_date'])
        else:
            logger.debug(
                f"Start date not in args. Using {default_delta} days prior to end date. ")
            start_date = end_date - timedelta(days=default_delta)
    except Exception as e:
        logger.exception(
            f"Error parsing date window for {spec['name']}.",
            extra={"transform": self.name, "data": self.args}
        )
        datewindow = None
        return datewindow

    if start_date > end_date:
        logger.exception(
                    f"Start date greater than end date. Skipping the spec {spec['name']}.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
        datewindow = None # be explicit
        return datewindow

    datewindow['start_date'] = start_date
    datewindow['end_date'] = end_date

    return datewindow
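
For example, with hypothetical args the window resolves as follows (run_date is assumed to be a datetime)::

    # self.args supplied to the transform
    args = {
        "run_date": datetime(2023, 6, 1),
        "end_date": "2023-05-31",   # optional; defaults to run_date - 1 day
        "start_date": None,         # optional; defaults to end_date - 60 days
    }
    # -> datewindow == {"start_date": datetime(2023, 4, 1), "end_date": datetime(2023, 5, 31)}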

get_handlers(spec)

Define various callbacks that take a dataframe, spec and compute.

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def get_handlers(self, spec):
    """
    Define various callbacks that take a dataframe, spec
    and compute.
    """
    return {}

load_source(spec)

Load all the sources into a 'data' dict and return it along with a validity flag.

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def load_source(self, spec):
    """
    Load all the sources into a 'data' dict
    and return it along with a validity flag.
    """
    config = spec['config']
    source = config['source']

    data = {}
    is_valid = True

    if 'observations' not in source:
        logger.exception(
            f"Spec config has no observations param, skipping.",
            extra={"transform": self.name, "data": json.dumps(spec, indent=4)})
        is_valid = False
        return is_valid, data

    if 'exovars' not in source:
        logger.debug(
            f"Exogenous variables not specified in {spec['name']}",
            extra={"transform": self.name, "data": json.dumps(spec, indent=4)})

    # get time_window for observations and exovars
    datewindow = self.get_datewindow(source, spec)
    if datewindow is None:
        logger.debug(
            f"Invalid date window for {spec['name']}",
            extra={"transform": self.name})
        is_valid = False
        return is_valid, data

    data['observations'] = {}
    for dataname, dataspec in source['observations'].items():
        dataset = self.load_dataset(spec, dataname, dataspec, datewindow)
        data["observations"][dataname] = dataset

    # then load the exovars data set if specified
    if "exovars" in source:
        data['exovars'] = {}
        for dataname, dataspec in source['exovars'].items():
            dataset = self.load_dataset(spec, dataname, dataspec, datewindow)
            data["exovars"][dataname] = dataset

    return is_valid, data
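
A hypothetical source block inside a spec config, matching the keys read above::

    config = {
        "source": {
            "observations": {
                "sales": {"dataset": "daily-sales", "params": {}},         # each entry loaded via load_dataset()
            },
            "exovars": {                                                   # optional exogenous series
                "holidays": {"dataset": "holiday-calendar", "params": {}},
            },
        },
    }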

postprocess_results(spec, result)

Postprocess the results. The postprocessing method is defined in the subclass.

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def postprocess_results(self, spec, result):
    """
    Postprocess the results. The postprocessing method is defined in the subclass.
    """
    config = spec['config']
    # do post_process results
    postprocess_results = config.get('postprocess_results', None)
    if postprocess_results:
        method = postprocess_results.get('method', "")
        params = postprocess_results.get('params', {})
        handler = getattr(self, method, None)
        if handler:
            result = handler(spec, result, params)
        else:
            logger.exception(
                f"Spec: {spec['name']} -- postprocess_results method not found",
                extra={"transform": self.name}
            )
    logger.debug(f"Postprocess results for {spec['name']} done",
                 extra={"transform": self.name})

    return result

precheck_spec(spec)

Check if the spec is valid

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def precheck_spec(self, spec):
    '''
    Check if the spec is valid
    '''
    is_valid_spec = True
    name = spec.get('name', 'NO_SPEC_NAME')

    enabled = spec.get("active", True)
    if not enabled:
        logger.debug(
            f"Spec not enabled, skipping.",
            extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
        )
        is_valid_spec = False
        return is_valid_spec

    for f in ["name", "config"]:
        if f not in spec:
            logger.exception(
                f"Spec has no {f} param, skipping.",
                extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
            )
            is_valid_spec = False
            return is_valid_spec

    config = spec['config']

    for f in ["source", "forecasters"]:
        if f not in config:
            logger.exception(
                f"Spec config has no {f} param, skipping.",
                extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
            )
            is_valid_spec = False
            return is_valid_spec

    return is_valid_spec

process(state)

Run the computation and update the state:

1. Load the datasets
2. Run forecasting
3. Process the forecasting results
4. Store the results

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    1. Load the datasets
    2. Run forecasting
    3. process the forecasting results
    4. store the results
    """
    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    # Will be used in other places..
    self.state = state

    # Get the profile spec
    is_valid, profile, msg = profilespec.get_profile(self, "policyapp.forecasting")
    if is_valid:
        logger.debug(
            f"Loaded profilespec",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")
    specs = profile.get("specs", None)
    if specs is None:
        raise Exception("Could not find 'specs' in profile")

    # Now go through each spec and process it
    for spec in specs:

        do_process_spec = self.precheck_spec(spec)
        if do_process_spec == False:
            continue

        ## we can now proceed with processing the spec
        # load source
        do_process_spec, data = self.load_source(spec)
        if do_process_spec == False:
            continue

        # post process the sources
        data = self.combined_dataset(spec, data)

        # run the forecasters
        result = self.process_spec(spec, data)
        if result is None:
            continue

        # postprocess the results
        result = self.postprocess_results(spec, result)

        # tag the result under the spec name
        result = {spec['name']: result}

        # store the  results
        self.store_result(spec, result)


    # Done
    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state

process_spec(spec, data)

Process the forecaster spec. Generate a result and chart for each forecaster.

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def process_spec(self, spec, data):
    """
    Process the forecaster spec.
    Generate a result and chart for each forecaster.
    """

    def store_chart(tsf):
        '''
        We generate multiple charts for each forecaster
        Immediately store the charts as we generate them
        '''
        viz = tsf['viz']
        filename = f"{forecaster_name}-{name}-forecasting.png"
        msg = self.store_viz(spec, filename, viz)
        tsf.pop('viz', None)
        return msg

    msg = ""
    name = spec["name"]
    config = spec["config"]

    forecasters = config['forecasters']

    # forecasters must be a dict
    if not isinstance(forecasters, dict):
        logger.exception("Forecasters must be a dict",
                         extra={"transform": self.name})
        raise Exception("Forecasters must be a dict")

    result = {"forecasts": {}}
    for forecaster_name, forecaster in forecasters.items():

        tsf = self.run_forecasting(spec, data, forecaster_name, forecaster)
        msg += store_chart(tsf)

        logger.debug(f"Processed and saved visualization for {forecaster_name}",
                extra={"transform": self.name, "data": msg})

        result['forecasts'][forecaster_name] = tsf

    logger.debug(f"Done processing all the forecasters",
            extra={"transform": self.name})

    return result

run_forecasting(spec, data, forecaster_name, forecaster)

Instantiate the forecaster and run forecasting

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def run_forecasting(self, spec, data, forecaster_name, forecaster):
    """
    Instantiate the forecaster and run forecasting
    """
    # default is prophet
    # type is vanilla
    strategy = forecaster.get('strategy', self.default_strategy)
    type = forecaster.get('type', self.default_type)
    params = forecaster.get('params', {})

    # return timeseries forecast
    tsf = {}
    chart_params = params.get('chart_params', {})

    if strategy == 'prophet':

        if type == "vanilla":
            observation  = params.get('observation', None)

            if observation is None:
                logger.exception(f"Observation time series must be specified for forecaster: {forecaster_name}",
                                extra={"transform": self.name, "data": json.dumps(forecaster, indent=4)})
                raise Exception("Observation must be specified for prophet forecaster")

            df = data['observations'][observation]

            forecast_obj = BaseProphetForecasterModel(df)
            forecast = forecast_obj.run_forecasting(params)

            viz = forecast_obj.visualize_forecasting(forecast, chart_params)
            del forecast_obj

        elif type ==  "exogenous":
            df = data['combined']

            forecast_obj = BaseProphetForecasterModel(df)
            forecast = forecast_obj.run_forecasting(params)
            viz = forecast_obj.visualize_forecasting(forecast, chart_params)
            del forecast_obj

        else:
            logger.exception(f"Invalid type for prophet forecaster: {forecaster_name}",
                            extra={"transform": self.name, "data": json.dumps(forecaster, indent=4)})
            raise Exception("Invalid type for prophet forecaster")

    tsf = {
        "forecast" : forecast,
        "viz" : viz,
    }
    msg = note(forecast, f"Forecast for {forecaster_name}")
    logger.debug(f"Forecasted time series for {forecaster_name}",
                    extra={"transform": self.name, "data": msg})

    return tsf
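
A hypothetical forecasters block inside a spec config; strategy defaults to "prophet" and type to "vanilla"::

    config = {
        "forecasters": {
            "baseline": {
                "strategy": "prophet",
                "type": "vanilla",
                "params": {
                    "observation": "sales",   # key into data["observations"]
                    "chart_params": {},       # forwarded as-is to the visualization
                },
            },
            "with_exovars": {
                "strategy": "prophet",
                "type": "exogenous",          # uses data["combined"] built by combined_dataset()
                "params": {"chart_params": {}},
            },
        },
    }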

anomalies

AnomaliesBase(*args, **kwargs)

Bases: Compute

Compute anomalies given a dataframe with columns

Features of transform baseclass include:

* Flexible configuration
* Highlevel specification of columns combinations and detection strategy
Source code in enrichsdk/contrib/lib/transforms/anomalies/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "AnomaliesBase"
    self.description = "Compute anomalies in column(s) of a dataframe"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }
get_dataset_s3(spec, paths)

Gets all files from paths and puts them together into a single dataframe. If self.args['cache']==True, then this consolidated dataframe is cached / read from cache as applicable.

Source code in enrichsdk/contrib/lib/transforms/anomalies/__init__.py
def get_dataset_s3(self, spec, paths):
    '''
    Gets all files from paths and puts them together
    into a single dataframe. If self.args['cache']==True,
    then this consolidated dataframe is cached / read from cache
    as applicable.
    '''
    msg = ""

    run_date    = self.args['run_date']
    config      = spec['config']
    dataset     = config['dataset']

    cache = self.args.get("cache", False)
    cachename = f"{dataset}-{run_date}"
    cachefile = f"cache/{self.name}-rawdata-cache-" + cachename + ".csv"

    # read from cache if available
    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            msg = f"Location: {cachefile}" + "\n"
            df = pd.read_csv(cachefile)
            logger.debug(f"Read cached {dataset}", extra={"transform": self.name})
            return df

    # read from S3
    dfs = []
    for path in paths:
        _df = self.read_s3_data(path)
        if _df is None:
            msg += f"Path not found, skipping: {path}" + "\n"
            continue
        msg += f"Read from path: {path}" + "\n"
        dfs.append(_df)
    df = pd.concat(dfs)

    logger.debug(f"Read fresh {dataset}", extra={"transform": self.name})

    # Cache it for future use
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if (len(paths) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": [paths],
                },
            ],
        }

    if not self.state.has_frame(spec['name']):
        self.update_frame(spec, f"Dataset: {dataset}", df, lineage)

    return df
get_handlers(spec)

Define various callbacks that take a dataframe, spec and compute.

Source code in enrichsdk/contrib/lib/transforms/anomalies/__init__.py
def get_handlers(self, spec):
    """
    Define various callbacks that take a dataframe, spec
    and compute.
    """
    return {}
get_profile()

Read the profile json from API

Source code in enrichsdk/contrib/lib/transforms/anomalies/__init__.py
def get_profile(self):
    """
    Read the profile json from API
    """

    if (not hasattr(self, "args")):
        raise Exception(
            "'args' transform attribute should be defined to use default get_profile method"
        )
    for p in ['apicred']:
        if self.args.get(p) == None:
            raise Exception(
                f"'{p}' attribute in args should be defined to use default get_profile method"
                )

    # call the API to get the anomaly specs
    anomalyspecs, is_valid, msg = load_profile_api(self.args)
    logger.debug(
        f"Loading profile from API",
        extra={"transform": self.name, "data": msg},
    )
    if is_valid == False:
        raise Exception(f"Error loading profile")

    specs = anomalyspecs["specs"]
    logger.debug(
        f"Found {len(specs)} specs",
        extra={"transform": self.name, "data": json.dumps(anomalyspecs, indent=4)},
    )

    return anomalyspecs
preprocess_spec(spec)

to be overloaded in the derived class

Source code in enrichsdk/contrib/lib/transforms/anomalies/__init__.py
def preprocess_spec(self, spec):
    '''
    to be overloaded in the derived class
    '''
    return spec
process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/anomalies/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    # Will be used in other places..
    self.state = state

    # Get the anomaly profile
    is_valid, profile, msg = profilespec.get_profile(self, "policyapp.outliersv2")
    if is_valid:
        logger.debug(
            f"Loaded profilespec",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")

    specs = profile.get("specs", None)
    if specs is None:
        raise Exception("Could not find 'specs' in profile")

    # Now go through each spec and generate anomaly reports
    for spec in specs:

        ## first, some checks on the spec
        do_process_spec = True
        name = spec.get('name', 'NO_SPEC_NAME')

        enabled = spec.get("active", True)
        if not enabled:
            logger.debug(
                f"Spec not enabled, skipping.",
                extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
            )
            do_process_spec = False
            continue

        for f in ["name", "config"]:
            if f not in spec:
                logger.exception(
                    f"Spec has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break
        if do_process_spec == False:
            continue

        config = spec['config']

        for f in ["source_id"]:
            if f not in config:
                logger.exception(
                    f"Spec config has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break

        if not do_process_spec:
            continue

        ## pre-process the spec
        try:
            spec = self.preprocess_spec(spec)
            logger.debug(f"Preproccessed spec: {spec['name']}",
                         extra={
                             'transform': self.name,
                             'data': json.dumps(spec, indent=4)
                         })

            ## we can now proceed with processing the spec
            # first, load the source data
            data = self.load_dataset(spec)

            ## process the spec to detect outliers
            data = self.process_spec(spec, data)

            if ((not isinstance(data, dict)) or
                (len(data) == 0)):
                continue

            # write the detected outliers
            self.store_result(spec, data)
        except:
            logger.exception(f"Failed to process {name}",
                             extra={
                                 'transform': self.name
                             })

    # Done
    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state
process_spec_default(data, spec)

Handle one specification at a time..

Source code in enrichsdk/contrib/lib/transforms/anomalies/__init__.py
def process_spec_default(self, data, spec):
    """
    Handle one specification at a time..
    """

    partialsamplerate = 0.05
    samplerate_lut = {
        "all": 1.0,
        "partial": partialsamplerate,
        "none": 0.0
    }
    tolerances = {
        "low": 1,
        "medium": 2,
        "high": 3,
    }

    def anomaly_note(row, threshold):
        distance = row[f"__anomaly_distance__"]
        if distance > threshold:
            return f"{(round(distance/threshold,2))}x outside expected sample deviation"
        return f"within expected sample deviation"


    msg = ""
    msg += f"Using default centroid distance anomaly detector" + "\n"

    config = spec["config"]
    msg += f"Config: {json.dumps(config, indent=4)}" + "\n"

    # Get hold of the data first...
    name = spec["name"]
    orig_df = data
    total_samples = len(orig_df)

    metrics     = config.get("metrics", orig_df.columns)
    groups      = config.get('groups', [])
    outputs     = config.get("outputs", orig_df.columns)
    dimensions  = config.get("dimensions", orig_df.columns)
    columns     = list(set(metrics + outputs + dimensions))

    msg += f"Combined set of columns: {columns}" + "\n"
    msg += f"{note(orig_df, 'Original DF')}" + "\n"

    #########
    # default anomaly detection
    #########
    # get tolerance thresold
    tolerance = config.get("threshold", config.get("thresold", "medium"))
    scalefactor = tolerances.get(tolerance, 2)

    # get the sample strategy for the normal data
    normal_samples = config.get("normal_samples", "partial")
    samplerate = samplerate_lut[normal_samples]

    msg += f"(tolerance, scalefactor): ({tolerance}, {scalefactor})" + "\n"

    logger.debug(f"Setting up for spec: {spec['name']}",
                     extra={
                         'transform': self.name,
                         'data': msg
                     })

    anomaly_stats = {}
    plotdata = {}
    dfs = []

    #########
    # we first do the leaf level, per metric to check for anomalies
    #########
    msg = f"Processing metrics: {metrics}" + "\n\n"

    for metric in metrics:

        # make a copy of the df, we'll keep adding anomaly metrics to it
        df = orig_df[columns].copy()

        if not is_numeric_dtype(df[metric]):
            msg += f"{metric} Metric not numeric. Skipping\n"
            continue

        # compute the anomalies for this metric
        points      = df[metric].to_numpy()     # metric values as a 1-D array
        centroid    = df[metric].mean()          # the computed centroid of the dataset
        distances   = abs(points - centroid)    # distances of each point to the centroid
        stddev      = np.nanstd(points)      # std dev of the metric values
        threshold   = stddev * scalefactor
        anomalies   = np.where(distances.flatten()>threshold, 'anomaly', 'normal')    # flag where anomalies occur

        # add columns indicating anomaly label
        id = f"metric-{metric}"
        df['id'] = id
        df['level'] = 'metric'
        df['name'] = metric
        df['__is_anomaly__'] = pd.Series(anomalies)

        # add columns indicating reason for anomaly
        df[f"__anomaly_distance__"] = pd.Series(distances.flatten())
        df[f"__anomaly_note__"] = df.apply(lambda x: anomaly_note(x, threshold), axis=1)

        df_a = df[df['__is_anomaly__']=='anomaly']
        n_anomalies = len(df_a)
        perc_anomalies = round(n_anomalies/total_samples*100, 2)

        df_n = df[df['__is_anomaly__']=='normal'].sample(frac=samplerate)
        df_n = df_n[0:min(3*n_anomalies,len(df_n))] # min 3x n_anomalies or configured sample of normal samples
        n_nsamples = len(df_n)

        # for this metric, we now have all the detected anomalies and the sampled normal data
        sampled_df = pd.concat([df_a, df_n])

        msg += f"--------------------------" + "\n"
        msg += f"Metric: {metric}" + "\n"
        msg += f"Computed stddev: {stddev}" + "\n"
        msg += f"Threshold: {threshold}" + "\n"
        msg += f"Anomalies: {n_anomalies}/{total_samples}={perc_anomalies}%" + "\n"
        msg += f"--------------------------" + "\n\n"

        anomaly_stats[id] = {
            "level": 'metric',
            "name": metric,
            "dimensions": dimensions,
            "n_anomalies": n_anomalies,
            "perc_anomalies": perc_anomalies,
            "n_normalsamples": n_nsamples,
            "n_plotsamples": len(df),
        }
        plotdata[id] = df

        dfs.append(sampled_df)

    logger.debug(f"Processed metrics level: {spec['name']}",
                     extra={
                         'transform': self.name,
                         'data': msg
                     })


    #########
    # then we do the group level, hierarchical
    #########
    msg = f"Processing groups: {groups}" + "\n\n"

    for group in groups:
        group_name      = group.get('group')
        g_dimensions    = group.get('dimensions', dimensions)
        g_metrics       = group.get('metrics')

        # we don't have what we need, skip
        if group_name is None or g_metrics is None:
            continue

        if not all([is_numeric_dtype(df[metric]) for metric in g_metrics]):
            msg += f"{group_name} One or more metrics are not numeric\n"
            continue

        # make a copy of the df, we'll keep adding anomaly metrics to it
        df = orig_df[columns].copy()

        points      = df[g_metrics].to_numpy()    # all data as an MxN matrix
        centroid    = df[g_metrics].mean().values # the computed centroid of the dataset
        distances   = distance.cdist(points, np.array([centroid]), 'euclidean') # distances of each point to centroid
        distances   = np.reshape(distances, len(distances))
        stddev      = np.nanstd(points)         # std dev of the metric values
        threshold   = stddev * scalefactor
        anomalies   = np.where(distances.flatten()>threshold, 'anomaly', 'normal')    # flag where anomalies occur

        # add columns indicating anomaly label
        id = f"group-{group_name}"
        df['id'] = id
        df['level'] = 'group'
        df['name'] = group_name
        df['__is_anomaly__'] = pd.Series(anomalies)

        # add columns indicating reason for anomaly
        df[f"__anomaly_distance__"] = pd.Series(distances.flatten())
        df[f"__anomaly_note__"] = df.apply(lambda x: anomaly_note(x, threshold), axis=1)

        df_a = df[df['__is_anomaly__']=='anomaly']
        n_anomalies = len(df_a)
        perc_anomalies = round(n_anomalies/total_samples*100, 2)

        df_n = df[df['__is_anomaly__']=='normal'].sample(frac=samplerate)
        df_n = df_n[0:min(3*n_anomalies,len(df_n))] # min 3x n_anomalies or configured sample of normal samples
        n_nsamples = len(df_n)

        # for this metric, we now have all the detected anomalies and the sampled normal data
        sampled_df = pd.concat([df_a, df_n])

        msg += f"--------------------------" + "\n"
        msg += f"Group: {group_name}" + "\n"
        msg += f"Computed stddev: {stddev}" + "\n"
        msg += f"Threshold: {threshold}" + "\n"
        msg += f"Anomalies: {n_anomalies}/{total_samples}={perc_anomalies}%" + "\n"
        msg += f"--------------------------" + "\n"

        anomaly_stats[id] = {
            "level": 'group',
            "name": group_name,
            "metrics": g_metrics,
            "dimensions": g_dimensions,
            "threshold": threshold,
            "n_anomalies": n_anomalies,
            "perc_anomalies": perc_anomalies,
            "n_normalsamples": n_nsamples,
            "n_plotsamples": len(df),
        }
        plotdata[id] = df

        dfs.append(sampled_df)

    logger.debug(f"Processed groups level: {spec['name']}",
                     extra={
                         'transform': self.name,
                         'data': msg
                     })

    if len(dfs) == 0:
        logger.debug(f"{name}: No outputs computed",
                     extra={
                         'transform': self.name,
                         'data': msg
                     })
        return None

    #########
    # construct the DF for output
    #########
    # concat for all metrics+groups
    df = pd.concat(dfs)
    # reorder columns
    first_cols = ['id', 'level', 'name']
    cols = first_cols + [c for c in df.columns if c not in first_cols]
    df = df[cols]

    msg = f"Final columns: {df.columns}" + "\n"

    window, start_date, end_date = self.get_window_dates(config, self.args)

    # compute stats of interest
    stats = {
        "timestamp": f"{datetime.now().isoformat()}",
        "policy": config,
        "data_start_date": f"{start_date}",
        "data_end_date": f"{end_date}",
        "strategy": "centroid",
        "tolerance": tolerance,
        "scalefactor": scalefactor,
        "normalsamples": normal_samples,
        "samplerate": samplerate,
        "n_rows": total_samples,
        "anomaly_stats": anomaly_stats,
    }

    msg += f"Stats: {json.dumps(stats, indent=4)}" + "\n"

    msg += f"{note(df, 'Anomaly DF')}" + "\n"

    logger.debug(f"Completed spec: {spec['name']}",
                     extra={
                         'transform': self.name,
                         'data': msg
                     })

    return {name: df, "stats": stats, "plotdata": plotdata}
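
For orientation, a minimal, self-contained sketch of the centroid-distance rule applied at the group level above. It is illustrative only: the toy data, the column names and the scale factor are assumptions, not the transform's configuration::

import numpy as np
import pandas as pd
from scipy.spatial import distance

# toy data with two numeric metrics (hypothetical column names)
rng = np.random.default_rng(0)
df = pd.DataFrame({
    "metric_a": rng.normal(0, 1, 500),
    "metric_b": rng.normal(0, 1, 500),
})
scalefactor = 3.0    # assumed scale factor

points    = df[["metric_a", "metric_b"]].to_numpy()   # all data as an MxN matrix
centroid  = points.mean(axis=0)                       # centroid of the dataset
dists     = distance.cdist(points, np.array([centroid]), "euclidean").flatten()
threshold = np.nanstd(dists) * scalefactor            # spread of the distances sets the cutoff
df["__is_anomaly__"] = np.where(dists > threshold, "anomaly", "normal")

print(df["__is_anomaly__"].value_counts())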

changepoints

ChangePointDetectorBase(*args, **kwargs)

Bases: Compute

Take a timeseries signal and identify changepoints in the signal

Features of the transform baseclass include:

* Flexible configuration
* High-level specification of change point detection:
    * specified data source or a custom method to generate one
    * generic change point detection method or custom defined ones
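
As a rough, standalone illustration of the idea (this is not the library's built-in detector; the window size, threshold and toy signal are assumptions), a change point can be flagged wherever a point deviates from the trailing window's mean by more than a few standard deviations::

import numpy as np
import pandas as pd

def naive_changepoints(signal: pd.Series, window: int = 30, k: float = 3.0) -> list:
    """Flag indices where a point jumps more than k rolling stds from the trailing mean."""
    mean = signal.rolling(window).mean().shift(1)   # statistics of the *previous* window
    std  = signal.rolling(window).std().shift(1)
    z = (signal - mean).abs() / std
    return list(signal.index[z > k])

# toy signal with a level shift halfway through
ts = pd.Series(np.r_[np.random.normal(0, 1, 200), np.random.normal(5, 1, 200)])
print(naive_changepoints(ts)[:3])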

Source code in enrichsdk/contrib/lib/transforms/changepoints/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "ChangePointDetectorBase"
    self.description = "Change point(s) detection for a timeseries signal given a spec"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }

    self.cautions = {
        "low": {"color": "green", "desc": "MINIMAL", "rec": "In a BUSINESS-AS-USUAL regime now."},
        "medium": {"color": "gold", "desc": "LOW to MODERATE", "rec": "Expect LOW to MODERATE swings in this regime."},
        "high": {"color": "red", "desc": "HIGH to EXTREME", "rec": "Stay alert for HIGH to EXTREME swings in this regime."},
    }

    self.epoch = time.time()    #for output path
get_dataset_s3(spec, source, paths, start_date, end_date)

Gets all files from paths and puts them together into a single dataframe. If self.args['cache']==True, then this consolidated dataframe is cached / read from cache as applicable.

Source code in enrichsdk/contrib/lib/transforms/changepoints/__init__.py
def get_dataset_s3(self, spec, source, paths, start_date, end_date):
    '''
    Gets all files from paths and puts them together
    into a single dataframe. If self.args['cache']==True,
    then this consolidated dataframe is cached / read from cache
    as applicable.
    '''
    msg = ""

    run_date    = self.args['run_date']
    config      = spec['config']
    dataset     = source['dataset']
    params      = source.get('params', {})

    cache = self.args.get("cache", False)
    cachename = f"{dataset}-{start_date}-to-{end_date}"
    cachefile = f"cache/{self.name}-rawdata-cache-" + cachename + ".csv"

    # read from cache if available
    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            msg = f"Location: {cachefile}" + "\n"
            df = pd.read_csv(cachefile, **params)
            logger.debug(f"Read cached {dataset}", extra={"transform": self.name, "data": msg})
            return df

    # read from S3
    dfs = []
    for path in paths:
        _df = self.read_s3_data(path, params)
        if _df is None:
            msg += f"Path error, skipping: {path}" + "\n"
            continue
        msg += f"Read from path: {path}" + "\n"
        dfs.append(_df)
    df = pd.concat(dfs)

    logger.debug(f"Read fresh {dataset}", extra={"transform": self.name})

    # Cache it for future use
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if (len(paths) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": paths,
                },
            ],
        }

    if not self.state.has_frame(spec['name']):
        self.update_frame(spec, f"Dataset: {dataset}", df, lineage)

    return df
get_handlers(spec)

Define various callbacks that take a dataframe, spec and compute.

Source code in enrichsdk/contrib/lib/transforms/changepoints/__init__.py
def get_handlers(self, spec):
    """
    Define various callbacks that take a dataframe, spec
    and compute.
    """
    return {}
process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/changepoints/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    # Will be used in other places..
    self.state = state

    # Get the profile spec
    is_valid, profile, msg = profilespec.get_profile(self, "policyapp.changepoint")
    if is_valid:
        logger.debug(
            f"Loaded profilespec",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")

    specs = profile.get("specs", None)
    if specs is None:
        raise Exception("Could not find 'specs' in profile")

    # Now go through each spec and process it
    for spec in specs:

        ## first, some checks on the spec
        do_process_spec = True
        name = spec.get('name', 'NO_SPEC_NAME')

        enabled = spec.get("active", True)
        if not enabled:
            logger.debug(
                f"Spec not enabled, skipping.",
                extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
            )
            do_process_spec = False
            continue

        for f in ["name", "config"]:
            if f not in spec:
                logger.exception(
                    f"Spec has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break
        if do_process_spec == False:
            continue

        config = spec['config']

        for f in ["source", "detector"]:
            if f not in config:
                logger.exception(
                    f"Spec config has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break
        if do_process_spec == False:
            continue

        ## we can now proceed with processing the spec
        source = config['source']
        for f in ["indicator", "observations"]:
            if f not in source:
                logger.exception(
                    f"Spec config has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break
        # exceptions have occurred, continue to next spec
        if do_process_spec == False:
            continue

        # get time_window for indicator and observation data set
        datewindow = self.get_datewindow(source, spec)
        if datewindow is None :
            do_process_spec = False
            continue

        # first, load the indicator dataset
        data = {}
        data['indicator'] = self.load_dataset(spec, 'indicator', source['indicator'], datewindow)
        if data['indicator'] is None:
            do_process_spec = False
            continue

        # then, load all the observations datasets
        data['observations'] = {}
        for ob_dataset, dataset in source['observations'].items():
            data['observations'][ob_dataset] = self.load_dataset(spec, ob_dataset, dataset, datewindow)

        # then, process it
        result = self.process_spec(spec, data)
        if result is None:
            continue

        ## store the expectation validation result
        self.store_result(spec, result)


    # Done
    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state
process_spec_default(spec, data)

Run the default change point detection

Source code in enrichsdk/contrib/lib/transforms/changepoints/__init__.py
def process_spec_default(self, spec, data):
    """
    Run the default change point detection
    """

    ## note: we need to save viz objects immediately after creation
    ## or they will be overwritten when the next viz object is created

    msg = ""
    name = spec['name']

    indicator_ts = data['indicator']
    cpd = self.run_changepoint_detector(indicator_ts, name)
    changepoints = cpd['changepoints']

    # save changepoint visualization
    viz = cpd['viz']
    filename = f"{name}-changepoints.png"
    l_msg = self.store_viz(spec, filename, viz)
    msg += l_msg
    cpd.pop('viz', None)

    observations = {}
    for observation, observation_ts in data['observations'].items():
        regimes = self.compute_regimes(indicator_ts, observation_ts, changepoints)
        viz = self.visualize_regimes(observation_ts, regimes)
        observations[observation] = {
            "regimes": regimes,
            "viz": viz
        }
        # save regimes visualization
        filename = f"{name}-regime-{observation}.png"
        l_msg = self.store_viz(spec, filename, viz)
        msg += l_msg
        observations[observation].pop('viz', None)

    logger.debug(
        f"Saved visualizations",
        extra={"transform": self.name, "data": msg}
    )

    result = {
        "changepoints": cpd,
        "observations": observations
    }

    return result

classifier

ClassifierBase(*args, **kwargs)

Bases: Compute

Take a training dataset and one or more eval datasets, build a classification model using the training dataset, then apply the model to the eval dataset(s) to generate predictions.

Features of the transform baseclass include:

* Flexible configuration
* High-level specification of steps in the ML classification flow:
    * specify multiple datasets (one for training, one or more for evaluation)
    * specify optional dataset prep methods
    * specify training model details with support for imbalanced datasets
    * specify evaluation strategy on one or more datasets
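
A minimal sketch of the kind of flow this baseclass drives, built from the same pieces the implementation below relies on (imbalanced-learn resampling, a scikit-learn classifier, stratified cross-validation). The toy data and parameter values are assumptions::

import numpy as np
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# toy imbalanced dataset (purely illustrative)
rng = np.random.default_rng(0)
X = rng.normal(size=(500, 4))
y = (rng.random(500) > 0.9).astype(int)   # roughly 10% positive class

# resample the minority class, then classify
pipeline = make_pipeline(RandomOverSampler(), KNeighborsClassifier(n_neighbors=3))

cv = StratifiedKFold(n_splits=5)
scores = cross_val_score(pipeline, X, y, cv=cv, scoring="roc_auc")
print(f"mean AUC: {scores.mean():.3f}")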

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "ClassifierBase"
    self.description = "Classification of data using a trained ML model"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }

    self.epoch = time.time()    #for output path
do_training(profilespec, modelspec, X, y, model, cv, metric)

Train a model given a dataset and a pipeline

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def do_training(self, profilespec, modelspec, X, y, model, cv, metric):
    """
    Train a model given a dataset and a pipeline
    """

    msg = ""

    name = modelspec['name']

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    fig, ax = plt.subplots(figsize=(9, 9))
    for fold, (train, test) in enumerate(cv.split(X, y)):
        model.fit(X[train], y[train])
        viz = RocCurveDisplay.from_estimator(
            model,
            X[test],
            y[test],
            name=f"ROC fold {fold}",
            alpha=0.3,
            lw=1,
            ax=ax,
        )
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)
    ax.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")

    # get the final model fit
    model.fit(X, y)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(
        mean_fpr,
        mean_tpr,
        color="b",
        label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
        lw=2,
        alpha=0.8,
    )

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(
        mean_fpr,
        tprs_lower,
        tprs_upper,
        color="grey",
        alpha=0.2,
        label=r"$\pm$ 1 std. dev.",
    )

    ax.set(
        xlim=[-0.05, 1.05],
        ylim=[-0.05, 1.05],
        xlabel="False Positive Rate",
        ylabel="True Positive Rate",
        title=f"[{name}] Mean ROC curve with variability')",
    )
    ax.axis("square")
    ax.legend(loc="lower right", fontsize=16)
    plt.tight_layout()

    # save training visualization
    filename = f"train-{name}-roc.png"
    l_msg = self.store_viz(profilespec, filename, plt)

    msg += l_msg

    # return the appropriate metric
    if metric == "auc":
        metric_val = mean_auc
    elif metric == "tpr":
        metric_val = mean_tpr
    elif metric == "fpr":
        metric_val = mean_fpr
    else:
        metric_val = mean_auc

    classifier = {
        "model": model,
        "metric": metric_val
    }

    return classifier, msg
get_classifier_pipeline(model)

Construct the classifier pipeline 1. resampling 2. classifier model

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def get_classifier_pipeline(self, model):
    """
    Construct the classifier pipeline
        1. resampling
        2. classifier model
    """

    # do we need to resample the data
    # supports upsampling the minority class for now
    resample = model.get("resample")
    if resample == 'random':
        resampler = RandomOverSampler()
    elif resample == 'smote':
        resampler = SMOTE()
    else:
        # no resampling by default
        resampler = None

    # then get the classifier algorithm
    algorithm = model.get("model", {}).get("algorithm")
    params = model.get("model", {}).get("params", {})
    if algorithm == "knn":
        classifier = KNeighborsClassifier(**params)
    elif algorithm == "svm":
        classifier = svm.SVC(**params)
    else:
        # use the kNN algorithm by default
        classifier = KNeighborsClassifier(n_neighbors=3)

    # construct the pipeline
    if resampler == None:
        pipeline = classifier
    else:
        pipeline = make_pipeline(resampler, classifier)

    return pipeline
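
A hedged usage example for the method above; only the keys it actually reads (resample, model.algorithm, model.params) come from the source, everything else is illustrative::

# Hypothetical model spec; only the keys consumed by get_classifier_pipeline are shown.
model = {
    "name": "knn-smote",
    "resample": "smote",                  # "random", "smote", or omit for no resampling
    "model": {
        "algorithm": "knn",               # "knn" or "svm"
        "params": {"n_neighbors": 5},
    },
}

# Inside a ClassifierBase subclass this would typically be used as:
#   pipeline = self.get_classifier_pipeline(model)
#   pipeline.fit(X, y)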
get_dataset_s3(spec, source, paths, start_date, end_date)

Gets all files from paths and puts them together into a single dataframe. If self.args['cache']==True, then this consolidated dataframe is cached / read from cache as applicable.

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def get_dataset_s3(self, spec, source, paths, start_date, end_date):
    '''
    Gets all files from paths and puts them together
    into a single dataframe. If self.args['cache']==True,
    then this consolidated dataframe is cached / read from cache
    as applicable.
    '''
    msg = ""

    run_date    = self.args['run_date']
    config      = spec['config']
    dataset     = source['dataset']
    params      = source.get('params', {})

    cache = self.args.get("cache", False)
    cachename = f"{dataset}-{start_date}-to-{end_date}"
    cachefile = f"cache/{self.name}-rawdata-cache-" + cachename + ".csv"

    # read from cache if available
    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            msg = f"Location: {cachefile}" + "\n"
            df = pd.read_csv(cachefile, **params)
            logger.debug(f"Read cached {dataset}", extra={"transform": self.name, "data": msg})
            return df

    # read from S3
    dfs = []
    for path in paths:
        _df = self.read_s3_data(path, params)
        if _df is None:
            msg += f"Path error, skipping: {path}" + "\n"
            continue
        msg += f"Read from path: {path}" + "\n"
        dfs.append(_df)
    df = pd.concat(dfs)

    logger.debug(f"Read fresh {dataset}", extra={"transform": self.name})

    # Cache it for future use
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if (len(paths) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": paths,
                },
            ],
        }

    if not self.state.has_frame(spec['name']):
        self.update_frame(spec, f"Dataset: {dataset}", df, lineage)

    return df
get_handlers(spec)

Define various callbacks that take a dataframe, spec and compute.

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def get_handlers(self, spec):
    """
    Define various callbacks that take a dataframe, spec
    and compute.
    """
    return {}
load_sources(profilespec)

Load all the data sources

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def load_sources(self, profilespec):
    """
    Load all the data sources
    """
    data = {}

    for source in profilespec.get('sources', []):
        name = source.get('name', 'NOT_SPECIFIED')

        # check for all fields needed
        if any(p not in source for p in ['nature', 'name', 'filename', 'stage']):
            logger.error(f"Malformed source [{name}]",
                         extra=self.config.get_extra({
                             'transform': self.name,
                             'data': json.dumps(source, indent=4)
                         }))
            continue

        if source['nature'] == 'disk':
            filename = source['filename']
            filename = f"{self.args['root']}/{filename}"
            df = pd.read_csv(filename)
            if df is None:
                logger.error(f"Source not found [{name}]",
                             extra=self.config.get_extra({
                                 'transform': self.name,
                                 'data': json.dumps(source, indent=4)
                             }))
                continue

            data[name] = df

        else:
            continue

        self.update_state(name, df, f"Source: {name}")

    # we have loaded all available data sources
    return data
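
A hedged example of a single sources entry that load_sources above will accept; the field names come from the checks in the source, the values are illustrative::

# Hypothetical 'sources' entry for a profilespec
source = {
    "nature": "disk",                     # only disk sources are handled above
    "name": "training_data",
    "filename": "classifier/train.csv",   # joined with self.args['root']
    "stage": "train",                     # training data; other stages feed prediction specs
}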
make_predictions(profilespec, data, classifiers, artifacts)

Generate predictions for the various eval datasets

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def make_predictions(self, profilespec, data, classifiers, artifacts):
    """
    Generate predictions for the various eval datasets
    """

    # to collect all the results
    results = {}

    # for each prediction spec in the profilespec
    for spec in profilespec.get('predict', []):
        # process it
        name = spec['name']
        if spec.get('enable', True) == False:
            logger.error(f"Spec [{name}] disabled, skipping",
                         extra=self.config.get_extra({
                             'transform': self.name
                         }))
            continue

        _dfs = []
        for source in spec.get('sources', []):
            _dfs.append(data[source])
        if len(_dfs)>0:
            eval_df = pd.concat(_dfs)
        else:
            logger.error(f"No sources to eval, skipping",
                         extra=self.config.get_extra({
                             'transform': self.name,
                             'data': json.dumps(spec, indent=4)
                         }))
            continue

        # get the target column name
        target = spec.get("target")
        ignore = spec.get("ignore", [])
        if target == None:
            logger.error(f"Target column not specified, skipping eval [{name}]",
                         extra=self.config.get_extra({
                             'transform': self.name,
                             'data': json.dumps(spec, indent=4)
                         }))
            continue

        # we now have eval_df
        # check if a prep data method is specified
        prep_data = profilespec.get("prep", {}).get("method")
        if prep_data != None:
            if hasattr(self, prep_data):
                handler = getattr(self, prep_data)
                prepped_eval_df, artifacts, msg = handler(eval_df, artifacts, 'predict')

            logger.debug(f"Prepped eval data [{name}]",
                         extra=self.config.get_extra({
                             'transform': self.name,
                             'data': msg
                         }))

        # check if all required columns are present
        missing_cols = set(artifacts['columns']).difference(set(prepped_eval_df.columns))
        for c in missing_cols:
            prepped_eval_df[c] = 0
        logger.debug(f"Added missing columns [{name}]",
                     extra=self.config.get_extra({
                         'transform': self.name,
                         'data': f"Missing cols: {missing_cols}"
                     }))


        # we now have the prepped eval df
        # run the specified classifier on it
        classifier_name = spec.get("model", "best")
        if classifier_name == "best":
            classifier_name = classifiers["best"]

        classifier = classifiers[classifier_name]['model']

        # create the data arrays
        X = prepped_eval_df[[c for c in prepped_eval_df.columns if c not in [target]+ignore]].to_numpy()

        # make the predictions
        r = classifier.predict(X)
        eval_df["__prediction"] = pd.Series(r)

        result = {
            "spec": spec,
            "n_datapoints": len(eval_df),
            "n_predictions": eval_df["__prediction"].value_counts().to_dict()
        }
        results[name] = result

        logger.debug(f"Predictions done [{name}]",
                     extra=self.config.get_extra({
                         'transform': self.name,
                         'data': note(eval_df, f"Predictions [{name}]")
                     }))

        # store results data csv
        self.store_result_data(profilespec, spec, result, eval_df)

    return results
prep_data(profilespec, data, artifacts)

Do any data prep needed. We may need to do data scaling, normalization, etc. here. Any artifacts of the prep that will be needed by the prediction stage must be returned by this function.
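
A minimal sketch of what such a prep method might look like. Only the call shape, handler(df, artifacts, stage) returning (df, artifacts, msg), comes from the source; the name prep_scaling and the scaling logic are assumptions::

import pandas as pd

def prep_scaling(self, df: pd.DataFrame, artifacts: dict, stage: str):
    """
    Hypothetical prep method (named in the profilespec under prep.method).
    Called as handler(df, artifacts, 'train'|'predict') and expected to
    return (df, artifacts, msg).
    """
    msg = ""
    numeric_cols = df.select_dtypes("number").columns

    if stage == "train":
        # remember the scaling parameters so the predict stage can reuse them
        artifacts["scaling"] = {c: (df[c].mean(), df[c].std()) for c in numeric_cols}

    for c, (mu, sigma) in artifacts.get("scaling", {}).items():
        if c in df.columns and sigma > 0:
            df[c] = (df[c] - mu) / sigma
            msg += f"scaled {c}\n"

    return df, artifacts, msg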

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def prep_data(self, profilespec, data, artifacts):
    """
    Do any data prep needed
    We may need to do data scaling, normalization, etc. here
    Any artifacts of the prep that will be needed by the
    prediction stage must be returned in this function
    """

    # setup the training data and artifacts
    train_data = None
    for source in profilespec['sources']:
        if source['stage'] == "train":
            train_data = source['name']
    if train_data == None:
        return data, artifacts

    # check if a prep data method is specified
    prep_data = profilespec.get("prep", {}).get("method")
    if prep_data == None:
        return data, artifacts

    # call the prep data method
    msg = ""
    if hasattr(self, prep_data):
        handler = getattr(self, prep_data)
        data[train_data], artifacts, msg = handler(data[train_data], artifacts, 'train')

    logger.debug(f"Prepped training data [{train_data}]",
                 extra=self.config.get_extra({
                     'transform': self.name,
                     'data': msg
                 }))

    return data, artifacts
process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug("Start execution",
                 extra=self.config.get_extra({
                     'transform': self.name
                 }))
    self.state = state

    # Get the profile spec
    is_valid, profile, msg = profilespec.get_profile(self, "policyapp.classifier")
    if is_valid:
        logger.debug(
            f"Loaded profilespec",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")

    specs = profile.get("specs", None)
    if specs is None:
        raise Exception("Could not find 'specs' in profile")

    # Now go through each spec and process it
    for spec in specs:

        ###
        # first, some checks on the spec
        do_process_spec = True
        name = spec.get('name', 'NO_SPEC_NAME')

        enabled = spec.get("active", True)
        if not enabled:
            logger.debug(
                f"Spec not enabled, skipping.",
                extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
            )
            do_process_spec = False
            continue

        for f in ["name", "sources", "train", "predict"]:
            if f not in spec:
                logger.exception(
                    f"Spec has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break
        if do_process_spec == False:
            continue

        ###
        # get all the data sources
        ##############
        # Re-write to use standard load_dataset(...) method
        # when using API-based spec and s3 data sources
        data = self.load_sources(spec)
        ##############
        if len(data) == 0:
            logger.exception("No datasources found, failing",
                         extra=self.config.get_extra({
                             'transform': self.name
                         }))
            raise Exception("No datasources")

        ###
        # model training stage
        classifiers, artifacts = self.train_models(spec, data)

        ###
        # make predictions for each evaluation dataset
        # and store results
        results = self.make_predictions(spec, data, classifiers, artifacts)

        # Store the metadata with results
        self.store_metadata(spec, results)

    # Done
    logger.debug("Complete execution",
                 extra=self.config.get_extra({
                     'transform': self.name
                 }))

    ###########################################
    # => Return
    ###########################################
    return state
store_metadata(spec, results)

Store all the metadata for the full run

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def store_metadata(self, spec, results):
    """
    Store all the metadata for the full run
    """
    metadata = self.get_default_metadata(self.state)
    metadata['spec'] = spec
    metadata['results'] = results

    store = spec.get("store", ["disk"])

    if "s3" in store:
        # store in s3
        appname     = spec.get('app',self.name)
        name        = spec['name']
        namespace   = spec.get('namespace', 'default')
        run_date    = self.args['run_date']
        s3          = self.args['s3']
        epoch       = self.epoch

        # where are we storing it?
        targetdir = os.path.join(self.args['s3root'], f"{appname}/{namespace}/{name}/{run_date}/{epoch}")
        metadatafile = os.path.join(targetdir, f"metadata.json")

        # write to s3
        with s3.open(metadatafile, 'w') as fd:
            json.dump(metadata, fd, indent=4, cls=SafeEncoder)
    if "db" in store:
        # store in db
        self.db_store_metadata(spec, predictspec, result, df)
    if "disk" in store:
        # store in disk
        name = spec['name']
        outfile = os.path.join(self.args['output'], f"{name}/metadata.json")
        with open(outfile, 'w') as fd:
            fd.write(json.dumps(metadata,indent=4))
train_models(profilespec, data)

Model training

Source code in enrichsdk/contrib/lib/transforms/classifier/__init__.py
def train_models(self, profilespec, data):
    """
    Model training
    """
    msg = ""

    # prep the training data
    # and generate any artifacts needed later
    artifacts = {}
    data, artifacts = self.prep_data(profilespec, data, artifacts)
    # we need a list of all columns which will be used in training
    for source in profilespec['sources']:
        if source['stage'] == "train":
            train_data = source['name']
            artifacts['columns'] = list(data[train_data].columns)

    # required params
    trainspec = profilespec.get("train")
    metric  = trainspec.get("metric", "auc") #what is the metric against which to compare models
    folds   = trainspec.get("folds", 1)      #how many folds for cross validation

    classifiers = {}

    # for each model to train
    models = trainspec.get("models", [])
    for model in models:
        if model.get("enable", True) == False:
            continue

        name    = model.get("name", f"{hashlib.md5(json.dumps(model).encode('utf-8')).hexdigest()}")
        model['name'] = name
        dataset = model.get("source")
        target  = model.get("target")
        ignore  = model.get("ignore", [])

        if dataset == None or dataset not in data:
            logger.error(f"Dataset not known, skipping training [{name}]",
                         extra=self.config.get_extra({
                             'transform': self.name,
                             'data': json.dumps(model, indent=4)
                         }))
            continue
        if target == None:
            logger.error(f"Target column not specified, skipping training [{name}]",
                         extra=self.config.get_extra({
                             'transform': self.name,
                             'data': json.dumps(model, indent=4)
                         }))
            continue

        msg += f"Model: {name}" + "\n"
        msg += f"Dataset: {dataset}" + "\n"
        msg += f"Target column: {target}" + "\n"
        msg += f"Ignore columns: {ignore}" + "\n"

        df = data[dataset]

        # create the data arrays
        X = df[[c for c in df.columns if c not in [target]+ignore]].to_numpy()
        y = df[target].to_numpy()

        msg += f"Size (X): {X.size}" + "\n"
        msg += f"Size (y): {y.size}" + "\n"

        # figure out the minority class
        # in case we need to resample
        class_distribution = pd.Series(y).value_counts(normalize=True)
        pos_label = class_distribution.idxmin()
        msg += f"Positive label: {pos_label}" + "\n"

        # construct the classifier pipeline object
        classifier_pipeline = self.get_classifier_pipeline(model)

        # set up the n-fold cross validation
        cv = StratifiedKFold(n_splits=folds)

        # do model training
        classifiers[name], l_msg = self.do_training(profilespec, model, X, y, classifier_pipeline, cv, metric)
        msg += l_msg

    # decide on what the best classifier is based on the metric
    classifiers['best'] = self.decide_best_classifier(classifiers)

    msg += f"Classifiers: {json.dumps(classifiers, indent=4, cls=SafeEncoder)}" + "\n"
    msg += f"Artifacts: {json.dumps(artifacts, indent=4, cls=SafeEncoder)}" + "\n"

    logger.debug(f"Completed training",
                 extra=self.config.get_extra({
                     'transform': self.name,
                     'data': msg
                 }))

    return classifiers, artifacts
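
Putting the training-related keys together, a hedged example of the "train" section this method reads; the key names come from the source above, the values are purely illustrative::

# Hypothetical "train" section of a profilespec
trainspec = {
    "metric": "auc",          # metric used to compare models
    "folds": 5,               # StratifiedKFold splits
    "models": [
        {
            "name": "knn-baseline",
            "enable": True,
            "source": "training_data",       # name of a loaded dataset
            "target": "label",               # target column
            "ignore": ["id"],                # columns excluded from features
            "resample": "random",            # see get_classifier_pipeline
            "model": {"algorithm": "knn", "params": {"n_neighbors": 3}},
        }
    ],
}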

data_quality

DataQualityBase(*args, **kwargs)

Bases: Compute

Run data quality checks against a data source based on a spec

Features of the transform baseclass include:

* Flexible configuration
* High-level specification of observability:
    * specified data source
    * custom defined data quality checks (same DSL as the Great Expectations python package)
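
A hedged sketch of a spec this transform could process. Only the name, config, source_id and expectations keys are checked in the source below; the shape of each expectation entry here is an assumption in the style of the Great Expectations DSL, not a confirmed format::

# Hypothetical data-quality spec; the structure of each expectation entry
# is an assumption and may differ from the actual DSL used.
spec = {
    "name": "orders-quality",
    "config": {
        "source_id": "orders",
        "expectations": [
            {"expectation": "expect_column_values_to_not_be_null", "column": "order_id"},
            {"expectation": "expect_column_values_to_be_between",
             "column": "amount", "min_value": 0},
        ],
    },
}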

Source code in enrichsdk/contrib/lib/transforms/data_quality/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "DataQualityBase"
    self.description = "Data quality checks for a data source given a spec"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }
get_dataset_s3(spec, paths)

Gets all files from paths and puts them together into a single dataframe. If self.args['cache']==True, then this consolidated dataframe is cached / read from cache as applicable.

Source code in enrichsdk/contrib/lib/transforms/data_quality/__init__.py
def get_dataset_s3(self, spec, paths):
    '''
    Gets all files from paths and puts them together
    into a single dataframe. If self.args['cache']==True,
    then this consolidated dataframe is cached / read from cache
    as applicable.
    '''
    msg = ""

    run_date    = self.args['run_date']
    config      = spec['config']
    dataset     = config['dataset']

    cache = self.args.get("cache", False)
    cachename = f"{dataset}-{run_date}"
    cachefile = f"cache/{self.name}-rawdata-cache-" + cachename + ".csv"

    # read from cache if available
    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            msg = f"Location: {cachefile}" + "\n"
            df = pd.read_csv(cachefile)
            logger.debug(f"Read cached {dataset}", extra={"transform": self.name})
            return df

    # read from S3
    dfs = []
    for path in paths:
        _df = self.read_s3_data(path)
        if _df is None:
            msg += f"Path not found, skipping: {path}" + "\n"
            continue
        msg += f"Read from path: {path}" + "\n"
        dfs.append(_df)
    df = pd.concat(dfs)

    logger.debug(f"Read fresh {dataset}", extra={"transform": self.name})

    # Cache it for future use
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if (len(paths) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": [paths],
                },
            ],
        }

    if not self.state.has_frame(spec['name']):
        self.update_frame(spec, f"Dataset: {dataset}", df, lineage)

    return df
get_handlers(spec)

Define various callbacks that take a dataframe, spec and compute.

Source code in enrichsdk/contrib/lib/transforms/data_quality/__init__.py
def get_handlers(self, spec):
    """
    Define various callbacks that take a dataframe, spec
    and compute.
    """
    return {}
process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/data_quality/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    # Will be used in other places..
    self.state = state

    # Get the profile spec
    is_valid, profile, msg = profilespec.get_profile(self, "policyapp.dataqualityv2")
    if is_valid:
        logger.debug(
            f"Loaded profilespec",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")

    specs = profile.get("specs", None)
    if specs is None:
        raise Exception("Could not find 'specs' in profile")

    # Now go through each spec and process it
    for spec in specs:

        ## first, some checks on the spec
        do_process_spec = True
        name = spec.get('name', 'NO_SPEC_NAME')

        enabled = spec.get("active", True)
        if not enabled:
            logger.debug(
                f"Spec not enabled, skipping.",
                extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
            )
            do_process_spec = False
            continue

        for f in ["name", "config"]:
            if f not in spec:
                logger.exception(
                    f"Spec has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break
        if do_process_spec == False:
            continue

        config = spec['config']

        for f in ["source_id", "expectations"]:
            if f not in config:
                logger.exception(
                    f"Spec config has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break
        if do_process_spec == False:
            continue

        ## we can now proceed with processing the spec
        # first, load the source data
        data = self.load_dataset(spec)

        # then, process it
        result = self.process_spec(spec, data)
        if result is None:
            continue

        ## store the expectation validation result
        self.store_result(spec, result)


    # Done
    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state

data_sanitizer

DataSanitizerBase(*args, **kwargs)

Bases: Compute

Sanitize data based on rules.

Features of the transform baseclass include:

* Flexible configuration
* High-level specification of transformations:
    * specified data source
    * custom defined rules
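
To illustrate the idea only (the rule format below is an assumption, not the transform's actual transformations spec), rule-driven sanitization of a dataframe might look like this::

import pandas as pd

# Hypothetical rules: mask a sensitive column, clip an out-of-range value
rules = [
    {"column": "email", "action": "mask"},
    {"column": "age",   "action": "clip", "lower": 0, "upper": 120},
]

def sanitize(df: pd.DataFrame, rules: list) -> pd.DataFrame:
    df = df.copy()
    for rule in rules:
        col = rule["column"]
        if col not in df.columns:
            continue
        if rule["action"] == "mask":
            df[col] = "***"
        elif rule["action"] == "clip":
            df[col] = df[col].clip(rule.get("lower"), rule.get("upper"))
    return df

df = pd.DataFrame({"email": ["a@x.com"], "age": [230]})
print(sanitize(df, rules))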

Source code in enrichsdk/contrib/lib/transforms/data_sanitizer/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "DataSanitizerBase"
    self.description = "Sanitize data based on rules"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }
get_dataset_s3(spec, source, paths, start_date, end_date)

Gets all files from paths and puts them together into a single dataframe. If self.args['cache']==True, then this consolidated dataframe is cached / read from cache as applicable.

Source code in enrichsdk/contrib/lib/transforms/data_sanitizer/__init__.py
def get_dataset_s3(self, spec, source, paths, start_date, end_date):
    '''
    Gets all files from paths and puts them together
    into a single dataframe. If self.args['cache']==True,
    then this consolidated dataframe is cached / read from cache
    as applicable.
    '''
    msg = ""

    run_date    = self.args['run_date']
    dataset     = source['dataset']
    params      = source.get('params', {})

    cache = self.args.get("cache", False)
    cachename = f"{dataset}-{start_date}-to-{end_date}"
    cachefile = f"cache/{self.name}-rawdata-cache-" + cachename + ".csv"

    # read from cache if available
    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            msg = f"Location: {cachefile}" + "\n"
            df = pd.read_csv(cachefile, **params)
            logger.debug(f"Read cached {dataset}", extra={"transform": self.name, "data": msg})
            return df

    # read from S3
    dfs = []
    for path in paths:
        _df = self.read_s3_data(path, params)
        if _df is None:
            msg += f"Path error, skipping: {path}" + "\n"
            continue
        msg += f"Read from path: {path}" + "\n"
        dfs.append(_df)
    df = pd.concat(dfs)

    logger.debug(f"Read {dataset}", extra={"transform": self.name})

    # Cache it for future use
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if (len(paths) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": paths,
                },
            ],
        }

    if not self.state.has_frame(spec['name']):
        self.update_frame(spec['name'] + "-raw", f"Raw Dataset: {spec['name']}", df, lineage)

    return df    
get_handlers(spec)

Define various callbacks that take a dataframe, spec and compute.

Source code in enrichsdk/contrib/lib/transforms/data_sanitizer/__init__.py
def get_handlers(self, spec):
    """
    Define various callbacks that take a dataframe, spec
    and compute.
    """
    return {}
process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/data_sanitizer/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Start execution", extra={"transform": self.name}
    )

    # Will be used in other places..
    self.state = state

    # Get the profile spec
    is_valid, profile, msg = profilespec.get_profile(self, "policyapp.dataqualityv2")
    if is_valid:
        name = profile.get('name', 'unknown')
        logger.debug(
            f"Loaded profilespec: {name}",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")

    specs = profile.get("specs", None)
    if specs is None:
        raise Exception("Could not find 'specs' in profile")

    # Now go through each spec and process it
    for spec in specs:

        ## first, some checks on the spec
        do_process_spec = True
        name = spec.get('name', 'NO_SPEC_NAME')

        enabled = spec.get("active", True)
        if not enabled:
            logger.debug(
                f"Spec not enabled, skipping.",
                extra={"transform": self.name, "data": json.dumps(spec, indent=4, cls=SafeEncoder)}
            )
            do_process_spec = False
            continue

        for f in ["name", "source", "transformations"]:
            if f not in spec:
                logger.exception(
                    f"Spec has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4, cls=SafeEncoder)}
                )
                do_process_spec = False
                break
        if do_process_spec == False:
            continue

        ## we can now proceed with processing the spec            
        source = spec['source']

        # get time_window for indicator and observation data set
        datewindow = self.get_datewindow(source, spec)
        if datewindow is None :
            do_process_spec = False
            continue

        ## we can now proceed with processing the spec
        # first, load the source data
        data = self.load_dataset(spec, source, datewindow)

        # then, process it
        result, msg = self.process_spec(spec, data)
        if result is None:
            continue

        ## store the expectation validation result
        self.store_result(spec, result, {'notes': msg})


    # Done
    logger.debug(
        "Complete execution", extra={"transform": self.name}
    )

    ###########################################
    # => Return
    ###########################################
    return state

feature_compute

FeatureComputeBase(*args, **kwargs)

Bases: Compute

A built-in transform baseclass to handle standard feature computation and reduce the duplication of code.

This should be used in conjunction with a FeaturesetExtractor & FeatureExtractor.

Source code in enrichsdk/contrib/lib/transforms/feature_compute/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "FeatureComputeBase"
    self._environ = os.environ.copy()
get_featureset_extractors()

Get all the featureset extractors (not feature extractors)

Returns: list: A list of extractors as a name, extractor combination

For example::

return [{
     "name": "patient",
     "extractor": <featureset extractor instance>
}]
Source code in enrichsdk/contrib/lib/transforms/feature_compute/__init__.py
def get_featureset_extractors(self):
    """
    Get all the *featureset* extractors (not feature extractors)

    Returns:
         list: A list of extractors as a name, extractor combination

    For example::

        return [{
             "name": "patient",
             "extractor": <featureset extractor instance>
        }]

    """
    raise Exception("Implement in subclass")
get_objects()

Get a list of objects (typically names) to process. Could be dictionaries, lists etc. The list is not interpreted by the base class. Could be a list of identifiers.

Returns: list: A list of objects (could be ids/paths/dicts etc.)

Source code in enrichsdk/contrib/lib/transforms/feature_compute/__init__.py
def get_objects(self):
    """
    Get a list of objects (typically names)  to process. Could be dictionaries,
    lists etc. The list is not interpreted by the base class. Could be a list of
    identifier.

    Returns:
       list: A list of objects (could be ids/paths/dicts etc.)

    """
    if "root" not in args:
        raise Exception("Base class implementation required 'root'")

    root = self.args["root"]
    files = os.listdir(root)
    return files
instantiable() classmethod

Return true if class can be instantiated. Override in subclass

Source code in enrichsdk/contrib/lib/transforms/feature_compute/__init__.py
@classmethod
def instantiable(cls):
    """
    Return true if class can be instantiated. Override in subclass
    """
    return False
process(state)

Core loop

Rough logic::

get featureset extractors
get objects
for each object:
     for each featureset extractor X
         process one object with X
         collect one featureset 'row' for X

for each featureset extractor X
Source code in enrichsdk/contrib/lib/transforms/feature_compute/__init__.py
def process(self, state):
    """
    Core loop

    Rough logic::

        get featureset extractors
        get objects
        for each object:
             for each featureset extractor X
                 process one object with X
                 collect one featureset 'row' for X

        for each featureset extractor X

    """

    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    self.state = state

    # What extractors to run on the data..
    featureset_extractors = self.get_featureset_extractors()
    featureset_extractors = [
        f for f in featureset_extractors if f.get("enable", True)
    ]

    # Go through all the available objects
    objects = self.get_objects()
    logger.debug(f"Received {len(objects)} objects", extra={"transform": self.name})

    # Compute the features..
    final = compute_features(objects, featureset_extractors, self.read_object)

    # Update the frame
    for name, df in final.items():
        if isinstance(df, pd.DataFrame):
            self.update_frame(
                name + "_features",
                "Features computed over the available data",
                df,
                objects[0],
            )

    # Store the result...
    files = self.store(final)

    registry = self.get_registry()
    dataset = registry.find(list(final.keys()))
    metadata = {
        'files': files
    }
    registry.access(dataset, metadata, 'write')

    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state
read_object(obj)

Read one object returned by get_objects

Args: obj (object): One item in the list of objects

Returns: object: An object like dict or list of dicts

Source code in enrichsdk/contrib/lib/transforms/feature_compute/__init__.py
def read_object(self, obj):
    """
    Read one object returned by get_objects

    Args:
        obj (object): One item in the list of objects

    Returns:
         object: An object like dict or list of dicts

    """

    if "root" not in args:
        raise Exception("Base class implementation required 'root'")

    root = self.args["root"]
    filename = os.path.join(root, obj)
    data = json.load(open(filename))
    return data
store(data)

Store the final result

Args: data (dict): name of featureset -> data associated with it

Source code in enrichsdk/contrib/lib/transforms/feature_compute/__init__.py
def store(self, data):
    """
    Store the final result

    Args:
        data (dict): name of featureset -> data associated with it

    """
    raise Exception("Implement in subclass")
validate_results(what, state)

Check to make sure that the execution completed correctly

Source code in enrichsdk/contrib/lib/transforms/feature_compute/__init__.py
def validate_results(self, what, state):
    """
    Check to make sure that the execution completed correctly
    """
    pass

filebased_query_executor

FileBasedQueryExecutorBase(*args, **kwargs)

Bases: Compute

Base class for a File-based QueryExecutor transform. This is useful for running queries against backends such as MySQL.

Features of transform baseclass include:

* Support query engines (MySQL, Hive, Presto)
* Support templatized execution
* Support arbitrary number of queries
* Supports a generator function to generate per-interval queries

Configuration looks like::

...
"args": {
    "cleanup": False,
    "force": True,
    "names": "all",
    "start": "2020-08-01",
    "end": "2020-08-03",
}
Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "FileBasedQueryExecutorBase"
    self.description = "Execute queries against backends and store in files"
    self.supported_extra_args = [
        {
            "name": "names",
            "description": "names of the queries to execute",
            "default": "all",
            "required": False,
        },
        {
            "name": "force",
            "description": "Force execution",
            "default": "False",
            "required": False,
        },
        {
            "name": "start",
            "description": "Start of the time window",
            "default": "",
            "required": True,
        },
        {
            "name": "end",
            "description": "End of the time window",
            "default": "",
            "required": True,
        },
    ]

    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }
generator_daily(spec, specitem, query)

Built-in function to generate a list of dates (one for each day) between two dates.
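
A small standalone illustration of the parameter list such a daily generator produces; the dates and the extra alpha parameter mirror the examples elsewhere on this page::

from datetime import date
from dateutil import relativedelta

def daily_params(start: date, end: date, extra: dict = None) -> list:
    """Produce one {'dt': ISO-date, ...} dict per day in [start, end)."""
    extra = extra or {}
    paramlist, dt = [], start
    while dt < end:
        paramlist.append({"dt": dt.isoformat(), **extra})
        dt += relativedelta.relativedelta(days=1)
    return paramlist

print(daily_params(date(2020, 8, 1), date(2020, 8, 3), {"alpha": 22}))
# [{'dt': '2020-08-01', 'alpha': 22}, {'dt': '2020-08-02', 'alpha': 22}]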

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def generator_daily(self, spec, specitem, query):
    """
    Built-in function to generate a list of dates (one for each day)
    between two dates.

    """

    start = self.args["start"]
    end = self.args["end"]
    if start > end:
        start, end = end, start

    if isinstance(start, datetime):
        start = start.date()
    if isinstance(end, datetime):
        end = end.date()

    # Pass any extra parameters
    extra = query.get("params", {})
    paramlist = []
    dt = start
    while dt < end:
        params = {"dt": dt.isoformat()}
        params.update(extra)

        dt += relativedelta.relativedelta(days=1)
        paramlist.append(params)

    return paramlist
get_executor(specitem, query, credentials)

Get executor for a specitem and credentials. This executor runs the query.

The executor could be specified within the query, spec, or could default to built-in one based on the credentials and dbtype within.

Args:

spec (dict): Specification of the query
query (dict): Particular query to execute
credentials (dict): Credentials for the backend

Returns:

a callable executor
Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def get_executor(self, specitem, query, credentials):
    """
    Get executor for a specitem and credentials. This executor
    runs the query.

    The executor could be specified within the query,
    spec, or could default to built-in one based on the
    credentials and dbtype within.

    Args:

        spec (dict): Specification of the query
        query (dict): Particular query to execute
        credentials (dict): Credentials for the backend

    Returns:

        a callable executor
    """

    default_executor = None
    if credentials["dbtype"] == "mysql":
        default_executor = self.mysql_executor
    elif credentials["dbtype"] == "hive":
        default_executor = self.hive_executor

    # Executor can be per query or for the entire set
    executor = query.get("executor", specitem.get("executor", default_executor))

    if (executor is not None) and callable(executor):
        return executor

    if (executor is not None) and hasattr(self, executor):
        return getattr(self, executor)

    raise Exception("Cant find executor: {}".format(executor))
get_generator(specitem, query)

Parameters generator. This is useful when a templatized query has to be run against the backend over many days. The output of the generator function is a list of dictionaries each of which is a key-value set for one time window (say a day)

Args:

spec (dict): Specification of the query
query (dict): Particular query to execute

Returns:

a callable generator function
Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def get_generator(self, specitem, query):
    """
    Parameters generator. This is useful when a templatized
    query has to be run against the backend over many
    days. The output of the generator function is a list of
    dictionaries each of which is a key-value set for one
    time window (say a day)

    Args:

        spec (dict): Specification of the query
        query (dict): Particular query to execute

    Returns:

        a callable generator function
    """
    generator = query.get("generator", specitem.get("generator", "generator_daily"))
    if (generator is not None) and callable(generator):
        return generator

    if (generator is not None) and hasattr(self, generator):
        return getattr(self, generator)

    raise Exception("Could not find generator: {}".format(generator))
get_output_handler(query, params)

Find a handler for the output of the query. This function should be over-ridden to compute the handler dynamically.

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def get_output_handler(self, query, params):
    """
    Find a handler for the output of the query. This function
    should be overridden to compute the handler dynamically.

    """
    if "output" not in query:
        raise Exception("Unable to determine output handler. No 'output' in query")

    if isinstance(query["output"], str):
        return FileOutputHandler(self, query["output"])

    raise Exception("Unable to determine output handler. Unsupported 'output' in query")
get_spec()

Get query execution specification. Override this

Returns:

specs (list): A list of dictionaries. Each dictionary specifies the name, credentials, and queries to run.

Example::

return [ { "name": "roomdb", "cred": "roomdb", "queries": [ { "name": "select_star", "output": "%(data_root)s/shared/db/select_star/%(dt)s.tsv", "sql": "%(transform_root)s/SQL/select_star.sql", "params": { "alpha": 22 } } ] }, { "enable": False, "name": "hive", "cred": "hiveserver", "queries": [ { "name": "employees", "output": "%(data_root)s/shared/db/employee/%(dt)s.tsv", "sql": "%(transform_root)s/SQL/employees.hql", } ] } ]

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def get_spec(self):
    """
    Get query execution specification. Override this

    Returns:

       specs (list): A list of dictionaries. Each dictionary
                     specifies name, credentials, queries to run

    Example::

       return [
           {
               "name": "roomdb",
               "cred": "roomdb",
               "queries": [
                   {
                       "name": "select_star",
                       "output": "%(data_root)s/shared/db/select_star/%(dt)s.tsv",
                       "sql": "%(transform_root)s/SQL/select_star.sql",
                       "params": {
                        "alpha": 22
                       }
                   }
               ]
           },
           {
               "enable": False,
               "name": "hive",
               "cred": "hiveserver",
               "queries": [
                   {
                       "name": "employees",
                       "output": "%(data_root)s/shared/db/employee/%(dt)s.tsv",
                       "sql": "%(transform_root)s/SQL/employees.hql",
                   }
               ]
           }
       ]

    """
    return []
hive_executor(specitem, credentials, query, params)

Built-in executor for queries against a hive backend. The output is dumped to a temporary file and then an output handler is called for post-processing.

Args:

spec (dict): Specification of the query
query (dict): Particular query to execute
credentials (dict): Credentials for the backend
Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def hive_executor(self, specitem, credentials, query, params):
    """
    Built in executor for queries against a hive backend. The output
    is dumped to a temporary file and then an output handler is called
    for post-processing.

    Args:

        spec (dict): Specification of the query
        query (dict): Particular query to execute
        credentials (dict): Credentials for the backend

    """
    try:

        targetdir = None

        # Should this be forced?
        force = self.args.get("force", False)
        cleanup = self.args.get("cleanup", True)

        # Get the output filename (s3, hdfspath etc.)
        handler = self.get_output_handler(query, params)

        if (not force) and handler.exists(params):
            logger.debug(
                "[Skipping:Exists] {} {}".format(query["name"], params["dt"]),
                extra={"transform": self.name},
            )
            return

        logger.debug(
            "[Computing] {} {}".format(query["name"], params["dt"]),
            extra={"transform": self.name},
        )

        # Create a temp directory
        targetdir = tempfile.mkdtemp(prefix="query_executor_")

        # Process the credentials
        config = get_mysql_config(credentials)

        # Instantiate the sql
        sqlfile = self.get_file(query["sql"])
        if not os.path.exists(sqlfile):
            raise Exception("Invalid sql file: {}".format(sqlfile))
        sql = open(sqlfile).read()

        # Resolve the sql content
        sql = sql.format(**params)

        sqlname = os.path.join(targetdir, "run.sql")
        with open(sqlname, "w") as fd:
            fd.write(sql)

        # => Now write the script
        tmpname = os.path.join(targetdir, "output.tsv")

        cmd = (
            "beeline -u jdbc:hive2://%(host)s:%(port)s --silent=true --verbose=False --outputformat=tsv"
            % config
        )

        if "user" in config:
            cmd += " -n '{}'".format(user)

        if "password" in config:
            cmd += " -p '{}'".format(password)

        # Generate the script to run
        script = """#!/bin/bash\n\n"""
        script += "date\n"
        script += "{} -f {} > {}\n".format(cmd, sqlname, tmpname)
        script += "date\n"
        script += "[ -s {0} ] && sed -i 's/\\r//g' {0}\n".format(tmpname)
        scriptname = os.path.join(targetdir, "run.sh")
        with open(scriptname, "w") as fd:
            fd.write(script)

        try:
            process = subprocess.Popen(
                ["/bin/bash", scriptname],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            out, err = process.communicate()
            logger.debug(
                "Executed the script",
                extra={
                    "transform": self.name,
                    "data": "Output:\n----\n"
                    + out.decode("utf-8")
                    + "\n\nError:\n-----\n"
                    + err.decode("utf-8"),
                },
            )
        except:
            logger.exception(
                "Error while executing the script",
                extra={
                    "transform": self.name,
                },
            )

        # => Now post-process it..
        handler.process(tmpname, params)

    except:
        logger.exception("Failed to execute", extra={"transform": self.name})

    try:
        if cleanup and (targetdir is not None) and os.path.exists(targetdir):
            shutil.rmtree(targetdir)
        else:
            logger.warning(
                "Targetdir not removed",
                extra=self.config.get_extra(
                    {
                        "transform": self.name,
                        "data": "Targetdir: {}".format(targetdir),
                    }
                ),
            )
    except:
        logger.exception(
            "Cleanup failed",
            extra=self.config.get_extra(
                {"transform": self.name, "data": "Targetdir: {}".format(targetdir)}
            ),
        )
mysql_executor(specitem, credentials, query, params)

Built-in executor for queries against a mysql backend. The output is dumped to a temporary file and then an output handler is called for post-processing.

Args:

spec (dict): Specification of the query
query (dict): Particular query to execute
credentials (dict): Credentials for the backend
Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def mysql_executor(self, specitem, credentials, query, params):
    """
    Built in executor for queries against a mysql backend. The output
    is dumped to a temporary file and then an output handler is called
    for post-processing.

    Args:

        spec (dict): Specification of the query
        query (dict): Particular query to execute
        credentials (dict): Credentials for the backend

    """
    try:

        targetdir = None

        # Should this be forced?
        force = self.args.get("force", False)
        cleanup = self.args.get("cleanup", True)

        # Get the output filename (s3, hdfspath etc.)
        handler = self.get_output_handler(query, params)

        if (not force) and handler.exists(params):
            logger.debug(
                "[Skipping:Exists] {} {}".format(query["name"], params["dt"]),
                extra={"transform": self.name},
            )
            return

        logger.debug(
            "[Computing] {} {}".format(query["name"], params["dt"]),
            extra={"transform": self.name},
        )

        # Create a temp directory
        targetdir = tempfile.mkdtemp(prefix="query_executor_")

        # Process the credentials
        config = get_mysql_config(credentials)

        # Create the environment file
        cnfname = os.path.join(targetdir, "env.sh")
        with open(cnfname, "w") as fd:
            fd.write("[client]\n")
            for var in ["host", "user", "password"]:
                fd.write("{}={}\n".format(var, config[var]))

        # Instantiate the sql
        sqlfile = self.get_file(query["sql"])
        if not os.path.exists(sqlfile):
            raise Exception("Invalid sql file: {}".format(sqlfile))
        sql = open(sqlfile).read()

        # Resolve the sql content
        sql = sql.format(**params)

        sqlname = os.path.join(targetdir, "run.sql")
        with open(sqlname, "w") as fd:
            fd.write(sql)

        # => Now write the script
        tmpname = os.path.join(targetdir, "output.tsv")

        cmd = "mysql --defaults-extra-file={}".format(cnfname)

        # Generate the script to run
        script = """#!/bin/bash\n\n"""
        script += "date\n"
        script += "{} -B < {} > {}\n".format(cmd, sqlname, tmpname)
        script += "date\n"
        script += "[ -s {0} ] && sed -i 's/\\r//g' {0}\n".format(tmpname)
        scriptname = os.path.join(targetdir, "run.sh")
        with open(scriptname, "w") as fd:
            fd.write(script)

        try:
            process = subprocess.Popen(
                ["/bin/bash", scriptname],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            out, err = process.communicate()
            logger.debug(
                "Executed the script",
                extra={
                    "transform": self.name,
                    "data": "Output:\n----\n"
                    + out.decode("utf-8")
                    + "\n\nError:\n-----\n"
                    + err.decode("utf-8"),
                },
            )
        except:
            logger.exception(
                "Error while executing the script",
                extra={
                    "transform": self.name,
                },
            )

        # => Now post-process it..
        handler.process(tmpname, params)

    except:
        logger.exception("Failed to execute", extra={"transform": self.name})

    try:
        if cleanup and (targetdir is not None) and os.path.exists(targetdir):
            shutil.rmtree(targetdir)
        else:
            logger.warning(
                "Targetdir not removed",
                extra=self.config.get_extra(
                    {
                        "transform": self.name,
                        "data": "Targetdir: {}".format(targetdir),
                    }
                ),
            )
    except:
        logger.exception(
            "Cleanup failed",
            extra=self.config.get_extra(
                {"transform": self.name, "data": "Targetdir: {}".format(targetdir)}
            ),
        )
preload_clean_args(args)

Check validity of the args

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def preload_clean_args(self, args):
    """
    Check validity of the args
    """
    args = super().preload_clean_args(args)

    if ("start" not in args) or ("end" not in args):
        raise Exception("Start or end of timeframe missing")

    try:
        start = dateparser.parse(args["start"])
        args["start"] = start
        end = dateparser.parse(args["end"])
        args["end"] = end
    except:
        logger.exception("Invalid start or end", extra={"transform": self.name})
        raise Exception("Invalid start/end datetime specified")

    if (
        ("names" not in args)
        or (not isinstance(args["names"], str))
        or (len(args["names"]) == 0)
    ):
        raise Exception("Invalid list of query names specified")

    # Include force
    force = str(args["force"]).lower().strip()
    force = force == "true"
    args["force"] = force

    # Clean the list of names...
    names = args["names"].split(",")
    names = [n.strip() for n in names if len(n.strip()) > 0]
    args["names"] = [n for n in names if len(n) > 0]

    return args
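
For instance, the transform's args might look like the following before cleaning (a sketch; the query names are hypothetical)::

    "args": {
        "start": "2020-08-01",
        "end": "2020-08-03",
        "names": "select_star, employees",
        "force": "False"
    }

After cleaning, start/end are parsed into datetimes, names becomes the list ["select_star", "employees"], and force becomes the boolean False.
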
process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Starting Query Execution",
        extra=self.config.get_extra({"transform": self.name}),
    )

    start = self.args["start"]
    end = self.args["end"]

    # Get and validate spec
    spec = self.get_spec()
    self.validate_spec(spec)
    try:
        self.process_spec(spec)
    except:
        logger.exception(
            "Failed while processing spec", extra={"transform": self.name}
        )
        raise

    logger.debug(
        "Completed Query Execution",
        extra=self.config.get_extra({"transform": self.name}),
    )
    ###########################################
    # => Return
    ###########################################
    return state
process_spec(spec)

Process query specification

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def process_spec(self, spec):
    """
    Process query specification
    """

    names = self.args["names"]
    for specitem in spec:

        try:

            itemname = specitem["name"]

            enable = specitem.get("enable", True)
            if not enable:
                logger.error(
                    "Skipping: {}. Not enabled.".format(itemname),
                    extra={"transform": self.name},
                )
                continue

            # What should we be executing to begin with..?
            toexecute = []
            for name in names:
                if name in specitem.get("definitions", {}):
                    toexecute.extend(specitem["definitions"][name])

                for q in specitem["queries"]:
                    if (name == "all") or (q["name"] == name):
                        toexecute.append(q["name"])

            # Cleanup
            toexecute = list(set(toexecute))

            if len(toexecute) == 0:
                logger.error(
                    "No parameter list generated: {}".format(itemname),
                    extra={"transform": self.name},
                )
                continue

            # Now process the list of queries. Params will be
            # generated per specitem.
            self.process_specitem(spec, specitem, toexecute)

            logger.debug(
                "Completed execution: {}".format(itemname),
                extra={"transform": self.name},
            )
        except:
            logger.exception(
                "Unable to execute: {}".format(itemname),
                extra={"transform": self.name},
            )
validate_results(what, state)

Check to make sure that the execution completed correctly

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def validate_results(self, what, state):
    """
    Check to make sure that the execution completed correctly
    """
    pass
validate_spec(spec)

Check whether specification is valid

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/__init__.py
def validate_spec(self, spec):
    """
    Check whether specification is valid
    """

    if not isinstance(spec, list):
        raise Exception("Query specification should be a list")

    for specitem in spec:

        if not isinstance(specitem, dict):
            logger.error(
                "Query specification items should be dicts",
                extra={
                    "transform": self.name,
                    "data": "Found {}\n{}".format(str(type(specitem)), specitem),
                },
            )
            raise Exception("Invalid specification")

        if "executor" in specitem:
            executor = specitem["executor"]
            if (not callable(executor)) and (
                not (
                    (isinstance(executor, str))
                    and (len(executor) > 0)
                    and (hasattr(self, executor))
                )
            ):
                raise Exception("Invalid executor specified: {}".format(executor))

        if "generator" in specitem:
            generator = specitem["generator"]
            if (not callable(generator)) and (
                not (
                    (isinstance(generator, str))
                    and (len(generator) > 0)
                    and (hasattr(self, generator))
                )
            ):
                raise Exception("Invalid generator specified: {}".format(generator))

        expected = ["name", "cred", "queries"]
        missing = [name for name in expected if name not in specitem]
        if len(missing) > 0:
            logger.error(
                "Query specification items should have required elements",
                extra={
                    "transform": self.name,
                    "data": "Missing {} in:\n{}".format(missing, specitem),
                },
            )
            raise Exception("Invalid specification")

        # => check the cred
        try:
            cred = self.get_credentials(specitem["cred"])
        except:
            logger.exception(
                "Unable to find credentials", extra={"transform": self.name}
            )
            raise Exception("Invalid specification")

        for q in specitem["queries"]:
            if (not isinstance(q, dict)) or (len(q) == 0):
                logger.error(
                    "Empty or invalid query specification",
                    extra={"transform": self.name, "data": "Query: {}".format(q)},
                )
                raise Exception("Invalid specification")

            expected = ["name", "sql", "output"]
            missing = [name for name in expected if name not in q]
            if len(missing) > 0:
                logger.error(
                    "Query specification items should have required elements",
                    extra={
                        "transform": self.name,
                        "data": "Missing {} in:\n{}".format(missing, specitem),
                    },
                )
                raise Exception("Invalid specification")

            if not isinstance(q["sql"], str) or len(q["sql"]) == 0:
                logger.error(
                    "Query specification items has invalid sql",
                    extra={
                        "transform": self.name,
                        "data": "SQL: {}".format(q["sql"]),
                    },
                )
                raise Exception("Invalid specification")

            if "generator" in q:
                generator = q["generator"]
                if (not callable(generator)) and (
                    not (
                        (isinstance(generator, str))
                        and (len(generator) > 0)
                        and (hasattr(self, generator))
                    )
                ):
                    raise Exception(
                        "Invalid generator specified: {}".format(generator)
                    )

        if "definitions" in specitem:
            definitions = specitem["definitions"]
            if (not isinstance(definitions, dict)) or (len(definitions) == 0):
                logger.error(
                    "Query specification items should have valid definition",
                    extra={
                        "transform": self.name,
                        "data": "Expected non-empty dict. Found:\n{}".format(
                            specitem
                        ),
                    },
                )
                raise Exception("Invalid specification")

            available_names = [q["name"] for q in specitem["queries"]]
            for k, v in definitions.items():
                if (not isinstance(v, list)) or (len(v) == 0):
                    logger.error(
                        "Query specification items should have valid definition",
                        extra={
                            "transform": self.name,
                            "data": "Expected valid non-empty value. Found:\n{}".format(
                                v
                            ),
                        },
                    )
                    raise Exception("Invalid specification")
                missing = [name for name in v if name not in available_names]
                if len(missing) > 0:
                    logger.error(
                        "Query specification items should have valid definition",
                        extra={
                            "transform": self.name,
                            "data": "Missing: {}\n{}".format(missing, definition),
                        },
                    )
                    raise Exception("Invalid specification")

    # Last check whether the requirements can be satisfied
    names = self.args["names"]

    available_names = ["all"]
    for specitem in spec:
        if "definitions" in specitem:
            available_names.extend(list(specitem["definitions"].keys()))
        for q in specitem["queries"]:
            available_names.append(q["name"])

    missing = [name for name in names if name not in available_names]
    if len(missing) > 0:
        logger.error(
            "Invalid names in args",
            extra={
                "transform": self.name,
                "data": "Missing: {}\nAvailable: {}".format(
                    missing, available_names
                ),
            },
        )
        raise Exception("Invalid specification")

lib

FileOutputHandler

Bases: OutputHandler

Config in this case is a simple path.

OutputHandler(transform, config)

Bases: object

How should output be handled?

Source code in enrichsdk/contrib/lib/transforms/filebased_query_executor/lib.py
def __init__(self, transform, config):
    self.transform = transform
    self.config = config

fileops

File Operations

FileOperationsBase(*args, **kwargs)

Bases: Trigger

Base class for a FileOperations transform. For now, only one action is supported: 'copy'. More actions will be added in the future.

Example::

    {
        "transform": "FileOperations",
        "enable": true,
        "dependencies": {
           ....
        },
        "args": {
            "actions": [
                {
                    "action": "copy",
                    "src": "%(output)s/%(runid)s/profile.sqlite",
                    "dst": "%(data_root)s/shared/campaigns/profile_daily/profile.sqlite",
                    "backupsuffix": ".backup"
                },
             ]
        }
    }
Source code in enrichsdk/contrib/lib/transforms/fileops/__init__.py
def __init__(self, *args, **kwargs):
    super(FileOperationsBase, self).__init__(*args, **kwargs)
    self.name = "FileOperationsBase"
    self.outputs = {}
    self.dependencies = {}
preload_clean_args(args)

Clean when the spec is loaded...

Source code in enrichsdk/contrib/lib/transforms/fileops/__init__.py
def preload_clean_args(self, args):
    """
    Clean when the spec is loaded...
    """

    # Update the args
    args = super().preload_clean_args(args)

    # Sanity check...
    if not isinstance(args, dict):
        raise Exception("args should be a dictionary")

    if ("actions" not in args) or (not isinstance(args["actions"], list)):
        raise Exception("actions is missing or invalid")

    for a in args["actions"]:
        if not isinstance(a, dict):
            raise Exception("Each action spec should be a dictionary")
        supported = ["copy"]
        if a["action"] not in supported:
            raise Exception("Unsupported action")

        if a["action"] == "copy":
            if ("src" not in a) or ("dst" not in a):
                raise Exception(
                    "Each copy action spec should specify a src and dst"
                )

    return args
process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/fileops/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "{} - process".format(self.name),
        extra=self.config.get_extra({"transform": self.name}),
    )

    msg = ""
    actions = self.args["actions"]
    for a in actions:
        action = a["action"]

        # Pass any global variables...
        srcbase = self.config.get_file(a["src"], extra=self.args)
        dstbase = self.config.get_file(a["dst"], extra=self.args)

        if "files" not in a:
            copyactions = [{"src": srcbase, "dst": dstbase}]
        else:
            copyactions = []
            for f in a["files"]:
                copyactions.append(
                    {
                        "src": os.path.join(srcbase, f),
                        "dst": os.path.join(dstbase, f),
                    }
                )

        backupsuffix = a.get("backupsuffix", ".backup")
        data_root = self.config.get_file("%(enrich_data_dir)s")
        for ca in copyactions:
            src = ca["src"]
            dst = ca["dst"]

            if not os.path.exists(src):
                raise Exception("Could not find source: {}".format(src))

            if os.path.exists(dst):
                backupsuffix = datetime.now().strftime(backupsuffix)
                backupdst = dst + backupsuffix
                if os.path.exists(backupdst):
                    if os.path.isdir(backupdst):
                        shutil.rmtree(backupdst)
                    else:
                        os.remove(backupdst)
                os.rename(dst, dst + backupsuffix)

            try:
                os.makedirs(os.path.dirname(dst))
            except:
                pass

            # Handle the directory names
            if os.path.isdir(src):
                shutil.copytree(src, dst)
            else:
                shutil.copy(src, dst)

            msg += "Copy: {} => {}\n".format(
                os.path.relpath(src, data_root), os.path.relpath(dst, data_root)
            )

    logger.debug(
        "{} - Completed".format(self.name),
        extra=self.config.get_extra({"transform": self.name, "data": msg}),
    )

    ###########################################
    # => Return
    ###########################################
    return state
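
A copy action may also carry an optional "files" list, in which case src and dst are treated as base directories and each listed file is copied individually. A configuration sketch (the file names are hypothetical)::

    {
        "action": "copy",
        "src": "%(output)s/%(runid)s/outputs",
        "dst": "%(data_root)s/shared/daily",
        "files": ["cars.csv", "summary.json"],
        "backupsuffix": ".backup"
    }
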
validate_results(what, state)

Check to make sure that the execution completed correctly

Source code in enrichsdk/contrib/lib/transforms/fileops/__init__.py
def validate_results(self, what, state):
    """
    Check to make sure that the execution completed correctly
    """
    pass

inmemory_query_executor

InMemoryQueryExecutorBase(*args, **kwargs)

Bases: AnonymizerMixin, Compute

Base class for an InMemory QueryExecutor transform. This is useful for running queries against backends such as MySQL.

Features of the transform baseclass include:

* Support multiple query engines (via SQLAlchemy)
* Support templatized execution
* Support arbitrary number of queries
* Support a generator function to generate per-interval queries

Configuration looks like::

...
"args": {
    "cleanup": False,
    "force": True,
    "targets": "all",
    "start_date": "2020-08-01",
    "end_date": "2020-08-03",
}

Specs

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "QueryExecutorBase"
    self.description = "Execute queries against backends"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }
generic_clean(df)

Do a high level clean of the query result before doing a query-specific clean

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def generic_clean(self, df):
    """
    Do a high level clean of the query result before
    doing a query-specific clean
    """
    return df
get_engine(spec)

Build and return an engine for a given specification.

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def get_engine(self, spec):
    """
    Build and return an engine for a given specification.
    """
    raise Exception("Construct sqlalchemy engine")
get_registry()

Build a registry and return

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def get_registry(self):
    """
    Build a registry and return
    """
    return None
get_specs()

Use get_sql_specs instead.

Deprecated since version 2.6.0.

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def get_specs(self):
    """
    Use get_sql_specs instead.

    .. warning::
        .. deprecated:: 2.6.0

    """

    return []
get_specs_from_sqls(sqldir)

Helper function. Load specifications from the SQLs.

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def get_specs_from_sqls(self, sqldir):
    """
    Helper function. Load specifications from the SQLs.
    """
    specs = []

    files = glob.glob(sqldir + "/*.sql")
    for f in files:
        name = os.path.basename(f).replace(".sql", "")
        sql = open(f).read()

        # Specify the split in the SQL itself..
        segment = None
        engine = None
        match = re.search(r"-- segment:: (\S+)", sql)
        if match is not None:
            segment = match.group(1).strip()

        match = re.search(r"-- name:: (\S+)", sql)
        if match is not None:
            name = match.group(1).strip()

        match = re.search(r"-- engine:: (\S+)", sql)
        if match is not None:
            engine = match.group(1).strip()

        specs.append(
            {"name": name, "sql": sql, "segment": segment, "engine": engine}
        )

    return specs
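
As an illustration, a SQL file can carry its metadata in comment directives; the parsing above would turn a (hypothetical) file like the one below into a spec entry::

    sql = """
    -- name:: txn_value
    -- segment:: global_date
    -- engine:: warehouse
    SELECT global_date, SUM(value) AS txn_value
    FROM txns
    WHERE global_date BETWEEN '%(start_date)s' AND '%(end_date)s'
    GROUP BY global_date
    """

    # get_specs_from_sqls would yield:
    # {"name": "txn_value", "sql": sql, "segment": "global_date", "engine": "warehouse"}
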
get_sql_specs()

Return a list of query specifications.

Specification: A list of dictionaries. Each dict has

  * name: Name of the specification
  * sql: SQL template
  * categories: String or a list of strings indicating specification groups
  * segment: How to split the dataframe resulting from query execution. Could be None ('complete' as the default name), a string (column name), or a callback that generates a { name: df } map
  * paramsets_duration: whether each parameter set covers one 'day' or a window of days (defined below)
  * paramsets_window: each parameter set translates into a date range for each instance of parameters

Examples::

Simple: { "name": "txn_value", "sql": "txn_value.sql", "segment": "global_date", }

Simple:

 {
     "categories": ["kyc"],
     "name": "kyc_txn_summary",
     "sql": "kyc_txn_summary.sql",
     "segment": complex_split_callbak,
     "paramsets_duration": "day",
     "retries": 3,
 },
Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def get_sql_specs(self):
    """
    Return a list of query specifications.

    Specification: A list of dictionaries. Each dict has

      * name: Name of the specification
      * sql: SQL template
      * categories: String or a list of strings indicating specification groups
      * segment: How to split the dataframe resulting from query execution. Could be none ('complete' as the default name), string (column name) or a callback that generates a { name: df } map
      * paramsets_duration: each instance for one 'day' or a window of days (defined below)
      * paramsets_window: each instance translates into date range for each instance of parameters.

    Examples::

       Simple:
         {
             "name": "txn_value",
             "sql": "txn_value.sql",
             "segment": "global_date",
         }

       Simple:

         {
             "categories": ["kyc"],
             "name": "kyc_txn_summary",
             "sql": "kyc_txn_summary.sql",
             "segment": complex_split_callbak,
             "paramsets_duration": "day",
             "retries": 3,
         },

    """

    return self.get_specs()
get_supported_extra_args()

Look at the specs to generate a list of options that can be presented to the end-user.

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def get_supported_extra_args(self):
    """
    Look at the specs to generate a list of options that
    can be presented to the end-user
    """

    # Collect specs first..
    specs = self.get_sql_specs()

    # Compute the targets
    targets = ["all"]  # default
    for s in specs:
        if not s.get("enable", True):
            continue
        categories = s.get("categories", s.get('category', []))
        if isinstance(categories, str):
            categories = [categories]
        for c in categories:
            if c not in targets:
                targets.append(c)
    for s in specs:
        name = s["name"]
        if name not in targets:
            targets.append(name)
    targets = "|".join(targets)

    # Now construct the args dynamically
    remaining = self.supported_extra_args
    return [
        {
            "name": "targets",
            "description": f"What all to run. Specify multiple with comma separating names ({targets})",
            "default": "all",
            "required": False,
        },
        {
            "name": "force",
            "description": "Force execution",
            "default": "False",
            "required": False,
        },
        {
            "name": "start_date",
            "description": "Start of the time window",
            "default": get_yesterday(),
            "required": True,
        },
        {
            "name": "end_date",
            "description": "End of the time window",
            "default": get_today(),
            "required": True,
        },
    ] + remaining
preload_clean_args(args)

Check validity of the args

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def preload_clean_args(self, args):
    """
    Check validity of the args
    """
    args = super().preload_clean_args(args)

    if ("start_date" not in args) or ("end_date" not in args):
        raise Exception("Start or end of timeframe missing")

    try:
        start = dateparser.parse(args["start_date"]).date()
        args["start_date"] = start
        end = dateparser.parse(args["end_date"]).date()
        args["end_date"] = end
    except:
        logger.exception(
            "Invalid start_date or end_date", extra={"transform": self.name}
        )
        raise Exception("Invalid start/end datetime specified")

    if (
        ("targets" not in args)
        or (not isinstance(args["targets"], str))
        or (len(args["targets"]) == 0)
    ):
        raise Exception("Invalid list of query names specified")

    # Include force
    force = str(args["force"]).lower().strip()
    force = force == "true"
    args["force"] = force

    # Clean the list of names...
    targets = args["targets"].split(",")
    targets = [n.strip() for n in targets if len(n.strip()) > 0]
    args["targets"] = [n for n in targets if len(n) > 0]

    return args
process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    # Will be used in other places..
    self.state = state

    # Get the registry
    self.registry = self.get_registry()

    # => Initialize anonymization if required
    if 'anonymization' in self.args:
        self.anonymize_init(self.args['anonymization'])

    # List of specification names
    targets = self.args["targets"]

    # Get specs..
    specs = self.get_sql_specs()

    logger.debug(f"Specs found: {len(specs)}", extra={"transform": self.name})
    # Now iterate through the specs.
    for spec in specs:
        try:

            name = spec["name"]
            categories = spec.get("categories",
                                  spec.get('category', ['all']))
            if isinstance(categories, str):
                categories = [categories]
            table = spec.get("table", name)
            cond = spec.get("cond", "")
            retries = spec.get("retries", 1)

            # To take care of the logging in case of exception
            msg = f"Name: {name}\n"

            # Check if this has been requested?
            if all([c not in targets for c in categories]) and (
                name not in targets
            ):
                continue

            logger.debug(
                f"Executing {spec['name']}",
                extra={
                    "transform": self.name,
                    "data": json.dumps(spec, indent=4, cls=SafeEncoder),
                },
            )

            sql_template = spec["sql"]

            files = []

            paramsets = self.generate_paramsets(
                spec, self.args["start_date"], self.args["end_date"]
            )

            for params in paramsets:

                status = []

                msg = f"Params: {params}\n"
                msg += f"Insert Table: {table} with {cond}\n"

                # Now log the SQL
                sql = sql_template % params
                msg += "SQL:\n{}\n".format(sql)

                # Get the engine for a given spec
                engine = self.get_engine(spec)

                segmentcol = spec.get("segment", None)

                tryno = 1
                while True:
                    if tryno > retries:
                        raise Exception("Exceeded max retries")

                    try:
                        df = pd.read_sql(sql, engine)
                        break
                    except:
                        logger.exception(
                            f"Failed Query: {name} (try {tryno})",
                            extra={"transform": self.name, "data": msg},
                        )
                    tryno += 1
                    time.sleep(30)

                # Do some basic cleaning. int becomes float
                df = self.generic_clean(df)

                msg += f"Segment: {segmentcol} (Initial split)\n"
                msg += "Records: {}\n".format(df.shape[0])
                msg += "Columns: {}\n".format(", ".join(df.columns))
                msg += "Dtypes: " + df.dtypes.to_string() + "\n"

                skip_empty = spec.get("skip_empty", True)
                if len(df) == 0:
                    # no data returned...
                    if skip_empty:
                        logger.warning(
                            f"Completed {name} {params['start_date']} No data",
                            extra={"transform": self.name, "data": msg},
                        )
                        continue
                    else:
                        logger.warning(
                            f"{name} {params['start_date']} No data",
                            extra={"transform": self.name, "data": msg},
                        )

                if ((len(df) == 0) and (not callable(segmentcol))):
                    msg = """Dont know how to handle an empty dataframe. Not sure what columns should be included with what values. segmentcol should be a callable""" + msg
                    logger.warning(f"{name} {params['start_date']} Skipping",
                                   extra={
                                       "transform": self.name,
                                       "data": msg
                                   })
                    continue

                # First gather a map of segments
                filemap = {}
                if segmentcol is None:
                    # Whole thing is one segment
                    filemap["complete"] = df
                elif isinstance(segmentcol, str):
                    # Split by column name...
                    segments = list(df[segmentcol].unique())
                    msg += f"Segments: {len(segments)} ({segmentcol})\n"
                    for segment in segments:
                        try:
                            df1 = df[df[segmentcol] == segment]
                            segment = str(segment)
                            filemap[segment] = df1
                        except:
                            pass
                elif callable(segmentcol):
                    # Custom split of the dataframe...
                    filemap = segmentcol(self, spec, params, df)
                    msg += f"Segments: {len(filemap)}\n"
                else:
                    raise Exception(f"Unhandled segment definition: {segmentcol}")

                # => Process each segment obtained from
                # the previous step...
                for segment, df1 in sorted(filemap.items()):

                    # Add note about what is being stored..
                    msg += f"[{segment}] {df1.shape[0]} records\n"

                    # Clean the output data...
                    try:
                        if "clean" in spec:
                            callback = spec["clean"]["callback"]
                            if callable(callback):
                                clean_msg, df1, clean_files = callback(
                                    self, segmentcol, segment, df1, spec
                                )
                                if len(clean_msg) > 0:
                                    msg += f"[{segment}] " + clean_msg + "\n"
                                files += clean_files
                    except Exception as e:
                        #traceback.print_exc()
                        msg += str(e)
                        raise

                    # Store in database...
                    try:
                        extra_dependencies = []
                        if "store" in spec:
                            # Separate storage handler..
                            callback = spec["store"]["callback"]
                            if callable(callback):
                                store_msg, store_dependencies, store_files = callback(
                                    self, segmentcol, segment, df1, spec
                                )
                                if len(store_msg) > 0:
                                    msg += f"[{segment}] " + store_msg + "\n"
                                extra_dependencies += store_dependencies
                                files += store_files

                    except Exception as e:
                        #traceback.print_exc()
                        msg += str(e)
                        raise

                    # Handle a default store for all specs, segments
                    try:


                        # => Anonymize the data
                        if hasattr(self, 'anonargs'):
                            anon_df1 = self.anonymize_target(spec['name'], df=df1)
                        else:
                            anon_df1 = None

                        # Store in s3 etc.
                        store_msg, store_dependencies, store_files  = self.store(
                            segmentcol, segment, df1, anon_df1, spec
                        )
                        if len(store_msg) > 0:
                            msg += f"[{segment}] " + store_msg

                        extra_dependencies += store_dependencies
                        files += store_files

                        # update lineage
                        self.update_frame(
                            name, engine, sql, df1, extra_dependencies
                        )


                    except Exception as e:
                        #traceback.print_exc()
                        msg += "[{}] Exception {}\n".format(segment, traceback.format_exc()) + "\n"

                logger.debug(
                    f"Completed {name} {params['start_date']}",
                    extra={"transform": self.name, "data": msg},
                )

            # Make note of it (if a registry is available)
            if self.registry is not None:
                dataset = self.registry.find(spec['name'])
                if ((dataset is not None) and (len(files) > 0)):
                    metadata = { 'files': files}
                    self.registry.access(dataset, metadata, nature='write')

        except:
            #traceback.print_exc()
            # Exception for each spec.
            logger.exception(
                f"Unable to run query: {name}",
                extra={"transform": self.name, "data": msg},
            )
            msg = ""
            continue

    self.add_marker(state)


    # Done
    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state
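
When "segment" is a callable, it receives the transform, the spec, the params, and the full dataframe, and must return a {name: dataframe} map (see the call in the loop above). A minimal sketch of such a callback, with a hypothetical "region" column::

    def complex_split_callback(transform, spec, params, df):
        # Split the result into one frame per region; fall back to a
        # single 'complete' segment when the column is absent
        if "region" not in df.columns:
            return {"complete": df}
        return {str(r): df[df["region"] == r] for r in df["region"].unique()}
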
update_frame(name, engine, sql, df, dependencies=[])

Note the lineage for each output file.

Source code in enrichsdk/contrib/lib/transforms/inmemory_query_executor/__init__.py
def update_frame(self, name, engine, sql, df, dependencies=[]):
    """
    Note the lineage for each output file.
    """

    # Check if it has already been registered
    if self.state.has_frame(name):
        return

    # Get the default database
    database = engine.url.database

    # Insert extra dependencies
    try:
        dependencies += get_lineage_of_query(engine, sql)
    except:
        dependencies = []
        logger.warning("Unable to get lineage",
                         extra={
                             'transform': self.name,
                             'data': f"SQL being checked:\n {sql}"
                         })

    # Generate column information...
    columns = self.get_column_metadata(name, df)

    ## => Gather the update parameters
    updated_detail = {
        "df": df,
        "description": f"Output for query {name}",
        "transform": self.name,
        "frametype": "pandas",
        "params": [
            {
                "type": "compute",
                "columns": columns,
            },
        ],
    }

    if len(dependencies) > 0:
        lineage = {"type": "lineage", "dependencies": dependencies}
        updated_detail['params'].append(lineage)

    # Dump it into the shared state
    self.state.update_frame(name, updated_detail, create=True)

metrics

MetricsBase(*args, **kwargs)

Bases: Compute

Compute metrics as input for the anomaly/other computation

Features of the transform baseclass include:

* Flexible configuration
* Highlevel specification of dimensions and metrics
Source code in enrichsdk/contrib/lib/transforms/metrics/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "MetricsBase"
    self.description = "Compute metrics against datasources"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }
get_dataset_generic(source)

Use the dataset object to read the dataset

Source code in enrichsdk/contrib/lib/transforms/metrics/__init__.py
def get_dataset_generic(self, source):
    """
    Use the dataset object to read the dataset
    """

    if (not hasattr(self, "read_data")) or (not hasattr(self, "get_dataset")):
        raise Exception(
            " get_dataset_generic expects read_data and get_dataset methods"
        )

    args = self.args

    start_date = source.get('start_date', self.args["start_date"])
    end_date = source.get('end_date', self.args["end_date"])

    name = source["name"]
    dataset = source["dataset"]
    params = source.get("params", {})
    filename = source.get('filename', 'data.csv')

    cache = args.get("cache", False)
    cachename = f"{dataset}-{start_date}-{end_date}-{filename}"
    cachefile = f"cache/{self.name}-cache-{cachename}"

    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            logger.debug(
                "Read cached {}".format(name), extra={"transform": self.name}
            )

            df = pd.read_csv(cachefile, **params)
            return {name: df}

    datasetobj = self.get_dataset(dataset)

    if hasattr(self, 'update_doodle'):
        self.update_doodle(datasetobj, filename)

    df, metadata = datasetobj.read_data(
        start_date,
        end_date,
        filename=source["filename"],
        readfunc=self.read_data,
        params=params,
    )

    logger.debug("Read {}".format(name), extra={"transform": self.name})

    # Cache it for future use...
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if ("files" in metadata) and (len(metadata["files"]) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": [metadata["files"][-1]],
                },
            ],
        }

    self.update_frame(source, df, lineage)

    return {name: df}
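
A source entry consumed by this helper might look like the following (a sketch; the dataset name, filename, and params are hypothetical)::

    source = {
        "name": "txn_summary",
        "nature": "file",                      # anything other than "db"
        "generate": "get_dataset_generic",     # routes here from get_datasets
        "dataset": "txn_summary_daily",
        "filename": "data.csv",
        "params": {"sep": ","},                # passed through to the read function
        # optional overrides for the transform-wide window
        "start_date": "2022-01-01",
        "end_date": "2022-01-07",
    }
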
get_datasets(profile, specs)

Load the datasets specified by the profile

Source code in enrichsdk/contrib/lib/transforms/metrics/__init__.py
def get_datasets(self, profile, specs):
    """
    Load the datasets specified by the profile
    """

    if not isinstance(profile, dict) or len(profile) == 0:
        logger.warning("Empty profile", extra={"transform": self.name})
        return {}

    # Get various kinds of handlers..
    handlers = self.get_handlers(profile)
    if not isinstance(handlers, dict) or len(handlers) == 0:
        logger.warning("No handlers specified", extra={"transform": self.name})
        handlers = {}

    required_sources = []
    for s in specs:
        required_sources.extend(s["sources"])
    required_sources = list(set(required_sources))

    # Now go about constructing the datasets
    datasets = {}

    found = []
    sources = self.get_sources(profile)
    for source in sources:

        nature = source.get("nature", "db")
        name = source["name"]

        if name not in required_sources:
            continue
        found.append(name)

        pipeline = source.get("pipeline", None)
        generate = source.get("generate", None)

        # Only db is used for now...
        try:
            if nature == "db":
                result = self.read_db_source(source)
            elif (
                (generate is not None)
                and (generate in handlers)
                and (callable(handlers[generate]))
            ):
                result = handlers[generate](source)
            elif (generate is not None) and (hasattr(self, generate)):
                result = getattr(self, generate)(source)
            else:
                raise Exception(f"Invalid specification: {name}")
        except:
            logger.exception(
                f"[{name}] generation failed", extra={"transform": self.name}
            )
            continue

        # Clean the read the dataset...
        try:
            if pipeline is not None and isinstance(pipeline, list):
                for processor in pipeline:
                    if isinstance(processor, str):
                        if processor in handlers:
                            result = handlers[processor](result, source)
                        elif hasattr(self, processor):
                            result = getattr(self, processor)(result, source)
                        else:
                            raise Exception(f"Missing post-processor: {processor}")
                    elif callable(processor):
                        result = processor(result, source)
                    else:
                        raise Exception(
                            "Only method names/callables are supported"
                        )
        except:
            logger.exception(
                f"[{name}] post-processing failed", extra={"transform": self.name}
            )
            continue

        # We could return multiple values or a single value
        if isinstance(result, dict):
            datasets.update(result)
        else:
            datasets[name] = result

    missing = [s for s in required_sources if s not in found]
    if len(missing) > 0:
        logger.error(
            f"Missing {len(missing)} sources",
            extra={
                "transform": self.name,
                "data": ", ".join(missing)
            }
        )
        raise Exception("Missing sources")

    return datasets
get_db_uri(source)

Return database URI for a source

Source code in enrichsdk/contrib/lib/transforms/metrics/__init__.py
def get_db_uri(self, source):
    """
    Return database URI for a source
    """
    return source["uri"]
get_handlers(profile)

Define various callbacks that take a dataframe, spec and compute. Specific to a single profile.

Source code in enrichsdk/contrib/lib/transforms/metrics/__init__.py
def get_handlers(self, profile):
    """
    Define various callbacks that take a dataframe, spec
    and compute. Specific to a single profile.
    """
    return {}
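
A subclass can return named callables here and reference them from a source's "generate" or "pipeline" entries. A minimal sketch with hypothetical handler names::

    import pandas as pd

    def get_handlers(self, profile):

        def drop_nulls(result, source):
            # pipeline step: takes the result and the source, returns the cleaned result
            return result.dropna()

        def generate_synthetic(source):
            # "generate" handler: takes the source, returns {name: dataframe}
            return {source["name"]: pd.DataFrame({"value": [1, 2, 3]})}

        return {
            "drop_nulls": drop_nulls,
            "generate_synthetic": generate_synthetic,
        }
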
get_printable_db_uri(engine)

pretty print the URL

Source code in enrichsdk/contrib/lib/transforms/metrics/__init__.py
def get_printable_db_uri(self, engine):
    """
    pretty print the URL
    """
    username = engine.url.username
    host = engine.url.host
    database = engine.url.database
    drivername = engine.url.get_driver_name()

    return f"{drivername}:///{host}/{database}/"
process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/metrics/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    # Will be used in other places..
    self.state = state

    # Get the profile spec
    is_valid, profile, msg = profilespec.get_profile(self, None)
    if is_valid:
        logger.debug(
            f"Loaded profilespec",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")

    # Get specs..
    specs = self.get_specs(profile)

    # First get the datasets
    datasets = self.get_datasets(profile, specs)

    # Now go through each spec and get the output..
    for spec in specs:
        enable = spec.get("enable", True)
        if not enable:
            continue
        self.process_spec(datasets, profile, spec)

    # Done
    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state
process_spec_default(datasets, profile, spec)

Handle one specification at a time..

Source code in enrichsdk/contrib/lib/transforms/metrics/__init__.py
def process_spec_default(self, datasets, profile, spec):
    """
    Handle one specification at a time..
    """

    if ("dimensions" not in spec) or (not isinstance(spec["dimensions"], dict)):
        raise Exception("Dimensions in spec should be a dict")

    if ("metrics" not in spec) or (not isinstance(spec["metrics"], dict)):
        raise Exception("Metrics in spec should be a dict")

    # Get hold of the data first...
    sources = self.get_spec_sources(spec, datasets)

    if len(sources) > 1:
        raise Exception("Use custom spec handler for multiple sources")

    datasetdf = list(sources.values())[0]

    # now go through each of the dimensions
    dimensions = spec["dimensions"]
    metrics = spec["metrics"]

    _dfs = []
    for name, cols in dimensions.items():

        if isinstance(cols, str):
            cols = [cols]

        # Dont need to include other columns...
        relevant = cols + list(metrics.keys())
        df = datasetdf[relevant]

        # Check if there are lists and explode them...
        for col in cols:
            if isinstance(df.iloc[0][col], list):
                df = df.explode(col)

        # Construct aggregates...
        df = df.groupby(cols)
        df = df.agg(metrics)

        # Clean up the index if multiple columns are specified
        if len(cols) > 1:
            df.index = df.index.map("+".join)
        df.index.name = "value"
        df = df.reset_index()

        # Also cleanup the column names...
        def clean_colname(what):
            if isinstance(what, (list, tuple)):
                what = "_".join(what)
                what = what.rstrip("_").lstrip("_")
            return what

        df.columns = df.columns.map(clean_colname)

        df.insert(0, "dimensions", name)

        _dfs.append(df)

    # merge all
    df = pd.concat(_dfs)
    del _dfs

    return {spec["name"]: df}

notebook_executor

NotebookExecutorBase(*args, **kwargs)

Bases: Compute

A built-in transform baseclass to handle standard notebook operation and reduce the duplication of code.

Features of this transform include:

* Support for custom args and environment
* Support for automatic capture and surfacing of output and err

Configuration looks like::

 class MyTestNotebook(NotebookExecutorBase):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.name = "TestNotebook"
         self.notebook = os.path.join(thisdir, "Test-Notebook.ipynb")

     @classmethod
     def instantiable(cls):
         return True

     def get_environment(self):
         return {
             'SECRET': credentials
         }
Source code in enrichsdk/contrib/lib/transforms/notebook_executor/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "NotebookExecutorBase"
    self.notebook = None
    self._environ = os.environ.copy()
get_environment()

Pass any additional parameters...

Source code in enrichsdk/contrib/lib/transforms/notebook_executor/__init__.py
def get_environment(self):
    """
    Pass any additional parameters...
    """
    return {}
get_notebook()

Define notebook that must be executed

Returns:

str: Path to the notebook
Source code in enrichsdk/contrib/lib/transforms/notebook_executor/__init__.py
def get_notebook(self):
    """
    Define notebook that must be executed

    Returns:

        str: Path to the notebook
    """
    if (
        (not hasattr(self, "notebook"))
        or (self.notebook is None)
        or (not isinstance(self.notebook, str))
        or (not os.path.exists(self.notebook))
    ):
        raise Exception(
            "Missing notebook. Missing/invalid path: {}".format(
                getattr(self, "notebook", "")
            )
        )

    notebook = self.notebook
    notebook = os.path.abspath(notebook)
    return notebook
preload_clean_args(args)

Standard args preprocessor. Make sure that an artifacts directory is created for storing the configuration file, output notebook and stdout/err.

Source code in enrichsdk/contrib/lib/transforms/notebook_executor/__init__.py
def preload_clean_args(self, args):
    """
    Standard args preprocessor. Make sure that
    an artifacts directory is created for storing the
    configuration file, output notebook and stdout/err.

    """
    args = super().preload_clean_args(args)

    # Insert artifacts if not available..
    if "artifacts" not in args:
        args["artifacts"] = self.get_file(
            "%(output)s/%(runid)s/artifacts", create_dir=True
        )
        try:
            os.makedirs(args["artifacts"])
        except:
            pass

    return args
process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/notebook_executor/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "{} - process".format(self.name),
        extra=self.config.get_extra({"transform": self.name}),
    )

    config = self.args
    configfile = os.path.join(config["artifacts"], "config.json")
    dump = lambda: json.dumps(config, indent=4, default=str)
    with open(configfile, "w") as fd:
        fd.write(dump())

    logger.debug(
        "Parameters to script",
        extra={
            "transform": self.name,
            "data": "Config: {}\n---\n".format(configfile) + dump(),
        },
    )

    # Update the environ
    _environ = os.environ.copy()
    try:
        # Update the environ
        update = self.get_environment()
        os.environ.update(update)

        # Now run the notebook
        self.run_notebook(config, configfile)

    finally:
        os.environ.clear()
        os.environ.update(_environ)

    return state
validate_results(what, state)

Check to make sure that the execution completed correctly

Source code in enrichsdk/contrib/lib/transforms/notebook_executor/__init__.py
def validate_results(self, what, state):
    """
    Check to make sure that the execution completed correctly
    """
    pass

observability

DataObserverBase(*args, **kwargs)

Bases: Compute

Monitor an input data source given a spec

Features of transform baseclass include:

* Flexible configuration
* High-level specification of observability:
    * specified data source
    * custom defined testing conditions for observability
    * custom defined output of observability results
    * notification of observability results on success/failure

Source code in enrichsdk/contrib/lib/transforms/observability/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "DataObserverBase"
    self.description = "Monitor an input data source given a spec"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }
get_dataset_s3(spec)

Use the dataset object to read the dataset

Source code in enrichsdk/contrib/lib/transforms/observability/__init__.py
def get_dataset_s3(self, spec):
    """
    Use the dataset object to read the dataset
    """
    run_date    = self.args['run_date']
    name        = spec["name"]
    config      = spec['config']
    source      = config['source']

    for f in ["dataset", "filename"]:
        if f not in source:
            msg = f"{f} param needed in config source" + "\n"
            logger.exception(
                f"Dataset: {name} -- skipping", extra={"transform": self.name, "data": msg}
            )
            return None

    dataset_type    = source['type']
    dataset         = source['dataset']
    pieces          = dataset.split('-')
    dataset_main    = "-".join(pieces[:-1])
    dataset_subset  = pieces[-1]
    filename        = source["filename"]
    params          = source.get("params", {})

    cache = self.args.get("cache", False)
    cachename = f"{dataset}-{run_date}"
    cachefile = f"cache/{self.name}-anonymizer-cache-" + cachename + ".csv"

    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            msg = f"Location: {cachefile}" + "\n"
            df = pd.read_csv(cachefile, **params)
            msg += note(df, f"Cached {dataset}") + "\n"
            logger.debug(f"Read cached {name}", extra={"transform": self.name, "data": msg})
            return df

    if dataset_type == "registry":
        if not hasattr(self, "get_dataset"):
            raise Exception(
                "get_dataset_s3 expects get_dataset method"
            )
        datasetobj = self.get_dataset(dataset_main) # this method should be defined in the derived class

        if hasattr(self, 'update_doodle'):
            self.update_doodle(datasetobj, source['filename'])

        df, metadata = datasetobj.read_data(
            run_date,
            run_date,
            filename=filename,
            readfunc=self.read_s3_data,
            params=params,
        )
    elif dataset_type == "direct":
        df = self.read_s3_data(filename, params)
        metadata = { "files": [filename] }
    else:
        logger.exception(
            f"Unknown source param: {dataset_type}, skipping", extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
        )
        return None

    msg = note(df, f"Fresh {dataset}") + "\n"
    logger.debug(f"Read fresh {name}", extra={"transform": self.name, "data": msg})

    # Cache it for future use...
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if ("files" in metadata) and (len(metadata["files"]) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": [metadata["files"][-1]],
                },
            ],
        }

    if not self.state.has_frame(spec['name']):
        self.update_frame(spec, f"Dataset: {dataset}", df, lineage)

    return df
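A hypothetical source block for this method; the dataset and filename values are placeholders::

source = {
    "type": "registry",               # or "direct" to read the file path as-is
    "dataset": "transactions-daily",  # resolved via get_dataset() on the derived class
    "filename": "data.csv",
    "params": {"sep": ","},           # passed through to the reader
}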
get_handlers(spec)

Define various callbacks that take a dataframe, spec and compute.

Source code in enrichsdk/contrib/lib/transforms/observability/__init__.py
def get_handlers(self, spec):
    """
    Define various callbacks that take a dataframe, spec
    and compute.
    """
    return {}
process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/observability/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    # Will be used in other places..
    self.state = state

    # Get the profile spec
    is_valid, profile, msg = profilespec.get_profile(self, "observability")
    if is_valid:
        logger.debug(
            f"Loaded profilespec",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")

    specs = profile.get("specs", None)
    if specs is None:
        raise Exception("Could not find 'specs' in profile")

    # get the dataset lookup table
    customer_datasets = profilespec.construct_dataset_list(self, specs)

    # Now go through each spec and process it
    for spec in specs:

        ## first, some checks on the spec
        do_process_spec = True
        name = spec.get('name', 'NO_SPEC_NAME')

        enabled = spec.get("enable", True)
        if not enabled:
            logger.debug(
                f"Spec not enabled, skipping.",
                extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
            )
            do_process_spec = False
            continue

        for f in ["name", "config"]:
            if f not in spec:
                logger.exception(
                    f"Spec has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break
        if do_process_spec == False:
            continue

        config = spec['config']

        for f in ["source", "checks", "store"]:
            if f not in config:
                logger.exception(
                    f"Spec config has no {f} param, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                do_process_spec = False
                break
        if do_process_spec == False:
            continue


        ## we can now proceed with processing the spec
        # first, load the source data
        data = self.load_dataset(spec, customer_datasets)

        # then, process it
        results = self.process_spec(spec, data)
        if results is None:
            continue

        ## notify the observability result
        results = self.notify_result(spec, results, data)

        ## store the observability result and notification status
        self.store_result(spec, results, data)

        # update frame for pipeline
        description = spec.get("desc", f"{name} observability results")
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": [spec.get("filename", "__NEW__")],
                },
            ],
        }
        self.update_frame(
            spec,
            description,
            results,
            lineage,
        )

    # Done
    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state
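A sketch of one observability spec, limited to the fields process() validates; the contents of checks and store are handled by the derived class and are placeholders here::

spec = {
    "name": "orders_freshness",
    "enable": True,
    "config": {
        "source": {"type": "registry", "dataset": "orders-daily", "filename": "orders.csv"},
        "checks": {},   # custom test conditions, interpreted by process_spec
        "store": {},    # where/how to persist results, interpreted by store_result
    },
}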

synthetic_data_generator

SyntheticDataGeneratorBase(*args, **kwargs)

Bases: Compute

Generate synthetic data given a specification

Features of transform baseclass include:

* Flexible configuration
* High-level specification of synthetic data in each column:
    * instance: pre-defined faker-based instances
    * distribution: pre-defined from statistical distributions
    * custom: custom defined in base/derived class

Source code in enrichsdk/contrib/lib/transforms/synthetic_data_generator/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "SyntheticDataGeneratorBase"
    self.description = "Generate synthetic data from a specification"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }
    self.fakeObj = None # for synthetic data generation, to be inited later on
anon_email(data, col_name, column)

Method to anonymize email data. Can generate emails to match or not match data in some name field. Also respects original email domain distribution if required. Input is the full dataframe, output is the relevant column being anonymized.

Source code in enrichsdk/contrib/lib/transforms/synthetic_data_generator/__init__.py
def anon_email(self, data, col_name, column):
    '''
    Method to anonymize email data. Can generate emails to match or not match
    data in some name field. Also respects original email domain distribution if required.
    Input is the full dataframe, output is the relevant column being anonymized.
    '''
    msg = ""

    match_names = column.get("match_names", True)
    if match_names is True:
        if "name_field" not in column:
            msg += f"Column {col_name} -- Unknown name field to match emails, setting random emails" + "\n"
            match_names = False
            return np.nan
        else:
            if column["name_field"] not in data.columns:
                msg += f"Column {column['name']} -- name field not in dataframe, setting random emails" + "\n"
                match_names = False
                return np.nan

    def generate_email(fakeObj, row, col_name, column, match_names):
        # whitelist of email domains
        # if the original email is in this list, don't replace it
        # useful to maintain data distribution
        domain_whitelist = ['gmail.com',
                            'yahoo.com',
                            'hotmail.com',
                            'aol.com']

        email_col_name  = col_name
        orig_domain     = row[email_col_name].split('@')[1]

        # set the email domain first
        if column.get("dist", "yes") == "yes":
            # we need to ensure that the distribution of generated email domains
            # match what was present in the input
            # popular free email domains will carry over, while others will be
            # replaced with random domains while still retaining distribution
            if any([d==orig_domain for d in domain_whitelist]):
                # retain the original domain name
                domain = orig_domain
            else:
                # get a new domain name
                domain = fakeObj['generators']['email_domain'][orig_domain]
        else:
            # no need to match distribution of generated email domains
            domain = fakeObj['faker'].ascii_email().split('@')[1]

        if match_names is True:
            # we want to match the anon email with the name field
            name = row[column['name_field']]
            names = unidecode.unidecode(name).lower().split(' ')
        else:
            # we don't care about matching the anon email with the name field
            names = fakeObj['faker'].name().split(' ')

        firstname = names[0]
        lastname = names[-1]

        # possible variations of email
        nameparts = {
            1: f"{firstname}",
            2: f"{lastname}",
            3: f"{firstname}.{lastname}",
            4: f"{firstname}.{firstname[0]}.{lastname}",
            5: f"{firstname}.{lastname[0]}.{lastname}",
            6: f"{firstname}.{firstname[0]}.{lastname[0]}",
            7: f"{firstname}.{random.randint(1,10000)}",
            8: f"{firstname}_{random.randint(1,10000)}",
            9: f"{firstname}.{lastname}.{random.randint(1,10000)}",
        }
        choice = random.randint(1, len(nameparts))
        namepart = nameparts[choice]
        email = f"{namepart}@{domain}"

        return email

    val = data.apply(lambda x: generate_email(self.fakeObj, x, col_name, column, match_names), axis=1)

    return val
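An illustrative column entry for anon_email -- the keys follow the lookups in the method, and the field values are examples::

column = {
    "anon_type": "email",
    "match_names": True,        # derive the email from an existing name column
    "name_field": "full_name",  # must be present in the dataframe
    "dist": "yes",              # keep popular domains, remap the rest
}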
anon_numeric(data, col_name, column)

Method to fuzz numeric data. Various fuzzing methods can be defined here. Input is the full dataframe, output is the relevant column being fuzzed.

Source code in enrichsdk/contrib/lib/transforms/synthetic_data_generator/__init__.py
def anon_numeric(self, data, col_name, column):
    '''
    Method to fuzz numeric data. Various fuzzing methods
    can be defined here.
    Input is the full dataframe, output is the relevant column being fuzzed.
    '''
    msg = ""

    method      = column.get("method", "perturb")
    params      = column.get("params", {})

    val = data[col_name]

    if method == "perturb":
        range = params.get("range", 0.05)
        val += random.uniform(-range*val, range*val)
    else:
        msg = f"Column {column['name']} -- Unknown method to anon column, setting default NaNs" + "\n"
        val = np.nan

    return val
anonymize_dataset(spec, data)

Anonymize a dataset given a spec

Source code in enrichsdk/contrib/lib/transforms/synthetic_data_generator/__init__.py
def anonymize_dataset(self, spec, data):
    '''
    Anonymize a dataset given a spec
    '''
    msg = ""
    name    = spec['name']
    config  = spec['config']

    # whether to anonymize all columns or a spec is defined
    columns_to_anon = "all" if "columns" not in config else "given"

    df_columns = data.columns
    columns = config.get('columns', {})

    # run through each column and try to anonymize
    anon_columns = []
    for col_name, col_obj in columns.items():
        include = col_obj.get("include", "yes")
        if include != "yes":
            continue
        params = {}
        if col_name not in df_columns:
            msg += f"Column: {col_name} not found, skipping" + "\n"
        else:
            data[col_name], l_msg = self.anonymize_single_column(col_name, col_obj, data, params)
            anon_columns.append(col_name)
            msg += l_msg

    # drop the other columns if required by spec
    action = config.get("nontransformed", "retain")
    if action == "drop":
        data = data[anon_columns]

    msg += note(data, "Anonymized dataset") + "\n"

    logger.debug(
        f"Spec: {name} dataset anonymized",
        extra={"transform": self.name, "data": msg}
    )

    return data
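A sketch of a spec config for this method; the column names and parameters are hypothetical::

config = {
    "nontransformed": "drop",   # drop columns that were not anonymized
    "columns": {
        "email": {"include": "yes", "anon_type": "email", "name_field": "full_name"},
        "salary": {"include": "yes", "anon_type": "numeric",
                   "method": "perturb", "params": {"range": 0.1}},
    },
}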
anonymize_single_column(col_name, col_obj, data, params)

Takes a dataset and anonymizes the specified column

Source code in enrichsdk/contrib/lib/transforms/synthetic_data_generator/__init__.py
def anonymize_single_column(self, col_name, col_obj, data, params):
    '''
    Takes a dataset and anonymizes the specified column
    '''
    msg = ""

    # get the faker object
    fakeObj = self.fakeObj

    # setup handlers for the various anonymization types
    generators = {}
    # first for the lookup generators
    for g, lookup in fakeObj['generators'].items():
        generators[g] = {
            "type": "lookup",
            "handler": lookup
        }
    # then for the custom generators
    generators["numeric"] = {
        "type": "custom",
        "handler": "anon_numeric"
    }
    generators["email"] = {
        "type": "custom",
        "handler": "anon_email"
    }

    anon_type = col_obj['anon_type']
    _d = []
    if anon_type in generators:
        gen_type = generators[anon_type]['type']
        gen_handler = generators[anon_type]['handler']
        if gen_type == "lookup":
            # we call the apply only on the specific column
            data = data[col_name].apply(lambda x: gen_handler[x])
        else:
            handler = getattr(self, gen_handler)
            # we call the apply to the full dataframe, we may need other columns
            # return is only the relevant column
            data = handler(data, col_name, col_obj)
        msg += f"Column: {col_name} anonymized" + "\n"
    else:
        data = np.nan
        msg += f"Column: {col_name} -- No <{anon_type}> generator found, defaulting to NaN" + "\n"

    return data, msg
get_dataset_s3(spec)

Use the dataset object to read the dataset

Source code in enrichsdk/contrib/lib/transforms/synthetic_data_generator/__init__.py
def get_dataset_s3(self, spec):
    """
    Use the dataset object to read the dataset
    """
    run_date    = self.args['run_date']
    name        = spec["name"]
    config      = spec['config']


    for f in ["dataset", "filename"]:
        if f not in config:
            msg = f"{f} param needed in config " + "\n"
            logger.exception(
                f"Dataset: {name} -- skipping", extra={"transform": self.name, "data": msg}
            )
            return None

    source      = config.get('source', 'registry')
    dataset     = config['dataset']
    pieces      = dataset.split('-')
    dataset_main = "-".join(pieces[:-1])
    dataset_subset = pieces[-1]
    filename    = config["filename"]
    params      = config.get("params", {})

    cache = self.args.get("cache", False)
    cachename = f"{dataset}-{run_date}"
    cachefile = f"cache/{self.name}-anonymizer-cache-" + cachename + ".csv"

    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            msg = f"Location: {cachefile}" + "\n"
            df = pd.read_csv(cachefile, **params)
            msg += note(df, f"Cached {dataset}") + "\n"
            logger.debug(f"Read cached {name}", extra={"transform": self.name, "data": msg})
            return df

    if source == "registry":
        if not hasattr(self, "get_dataset"):
            raise Exception(
                "get_dataset_s3 expects get_dataset method"
            )
        datasetobj = self.get_dataset(dataset_main) # this method should be defined in the derived class

        if hasattr(self, 'update_doodle'):
            self.update_doodle(datasetobj, filename)

        df, metadata = datasetobj.read_data(
            run_date,
            run_date,
            filename=filename,
            readfunc=self.read_s3_data,
            params=params,
        )
    elif source == "direct":
        params = {}
        df = self.read_s3_data(filename, params)
        metadata = { "files": [filename] }
    else:
        logger.exception(
            f"Dataset: {name} -- unknown source param: {source}, skipping", extra={"transform": self.name}
        )
        return None

    msg = note(df, f"Fresh {dataset}") + "\n"
    logger.debug(f"Read fresh {name}", extra={"transform": self.name, "data": msg})

    # Cache it for future use...
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if ("files" in metadata) and (len(metadata["files"]) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": [metadata["files"][-1]],
                },
            ],
        }

    if not self.state.has_frame(spec['name']):
        self.update_frame(spec, f"Dataset: {dataset}", df, lineage)

    return df
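A hypothetical config block for this method; the dataset and filename values are placeholders::

config = {
    "source": "registry",            # or "direct"
    "dataset": "customers-profile",  # the last "-" separated piece is treated as the subset
    "filename": "customers.csv",
    "params": {"sep": ","},
}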
get_handlers(spec)

Define various callbacks that take a dataframe, spec and compute.

Source code in enrichsdk/contrib/lib/transforms/synthetic_data_generator/__init__.py
def get_handlers(self, spec):
    """
    Define various callbacks that take a dataframe, spec
    and compute.
    """
    return {}
process(state)

Run the computation and update the state

Source code in enrichsdk/contrib/lib/transforms/synthetic_data_generator/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    """
    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    # Will be used in other places..
    self.state = state

    # init the faker object for data generation
    self.fakeObj = self.init_faker_object()

    # Get the profile spec
    is_valid, profile, msg = profilespec.get_profile(self, "syntheticdata")
    if is_valid:
        logger.debug(
            f"Loaded profilespec",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")

    # get the dataset lookup table
    customer_datasets = profilespec.construct_dataset_list(self, profile)

    # get the specs to process
    specs = profile.get("specs", None)
    if specs is None:
        raise Exception("Could not find 'specs' in profile")

    # Now go through each dataset and generate synthetic data for it
    for spec in specs:

        process_spec = True

        enabled = spec.get("enable", True)
        if not enabled:
            logger.debug(
                f"Spec <{spec.get('name', 'NO NAME')}> not enabled, skipping.",
                extra={"transform": self.name}
            )
            process_spec = False
            continue

        for f in ["name", "config"]:
            if f not in spec:
                logger.error(
                    f"Spec has no {f} param set, skipping.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
                process_spec = False
                break

        if process_spec == False:
            # something is wrong with this spec, skip it
            continue

        # process the spec
        name    = spec['name']
        action  = spec.get('action', 'anonymize')

        if action == 'generate':
            # we have a generate a synthetic dataset
            frametype = "synthetic"
            data = self.generate_dataset(spec)
        elif action == 'anonymize':
            # we have to anonymize a given dataset
            frametype = "anonymized"
            # first, load it
            data = self.load_dataset(spec, customer_datasets)

            # then, anonymize it
            if data is not None:
                data = self.anonymize_dataset(spec, data)
            else:
                msg = "Could not anonymize dataset" + "\n"
                logger.exception(
                    f"Spec: {spec['name']} -- skipping",
                    extra={"transform": self.name}
                )
        else:
            logger.exception(
                f"Unknown action param in spec, skipping spec: {spec['name']}",
                extra={"transform": self.name}
            )
            # nothing to store for an unknown action
            data = None

        # store the generated dataset
        if data is not None:
            self.store_result(spec, data)

            # update frame for pipeline
            description = spec.get(f"desc -- {frametype}", f"{frametype.title()} generated dataset")
            lineage = {
                "type": "lineage",
                "transform": self.name,
                "dependencies": [
                    {
                        "type": "file",
                        "nature": "input",
                        "objects": [spec.get("filename", "__NEW__")],
                    },
                ],
            }
            self.update_frame(
                spec,
                description,
                data,
                lineage,
            )

    # Done
    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state
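A sketch of one synthetic-data spec as iterated over by process(); the names are illustrative and the columns block follows the anonymize_dataset example above::

spec = {
    "name": "customers_anon",
    "enable": True,
    "action": "anonymize",   # or "generate" for fully synthetic data
    "config": {
        "dataset": "customers-profile",
        "filename": "customers.csv",
        "columns": {},       # per-column anonymization spec
    },
}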

timeseries_forecaster

TimeSeriesForecasterBase(*args, **kwargs)

Bases: Compute

Take a timeseries and project its future values with exogenous variables.

Features of transform baseclass include:

* Flexible configuration
* High-level specification of time series forecasting:
    * specified data source or custom method to generate one
    * by default, forecast using Facebook's Prophet library, or custom defined forecasters using other libraries

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.name = "TimeSeriesForecasterBase"
    self.description = "Forecast future values of a timeseries"
    self.testdata = {
        "data_root": os.path.join(os.environ["ENRICH_TEST"], self.name),
        "statedir": os.path.join(os.environ["ENRICH_TEST"], self.name, "state"),
        "conf": {"args": {}},
        "data": {},
    }

    self.default_strategy = "prophet"
    self.default_type = "vanilla"

    self.epoch = time.time()    #for output path
combined_dataset(spec, data)

Adds the combined dataset to the data dict

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def combined_dataset(self, spec, data):
    """
    Adds the combined dataset to the data dict
    """
    config = spec['config']
    combined_dataset = pd.DataFrame()

    if "combine_sources" in config:
        combine_sources = config["combine_sources"]
        dataset = combine_sources.get("dataset", None)

        if hasattr(self, dataset):
            params = combine_sources.get("params", {})
            handler = getattr(self, dataset)
            combined_dataset =  handler(params, data, spec)
            data['combined'] = combined_dataset

            msg = note(combined_dataset, "Combined dataset")
            logger.debug(f"Combined dataset for {spec['name']}",
                         extra={"transform": self.name, "data": msg})

    return data
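A hypothetical combine_sources block; merge_observations_exovars would be a method defined on the derived class and is named here only for illustration::

config = {
    "combine_sources": {
        "dataset": "merge_observations_exovars",   # method looked up on self
        "params": {"how": "left", "on": "ds"},     # passed to that method
    }
}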
get_dataset_s3(spec, source, paths, start_date, end_date)

Gets all files from paths and puts them together into a single dataframe. If self.args['cache']==True, then this consolidated dataframe is cached / read from cache as applicable.

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def get_dataset_s3(self, spec, source, paths, start_date, end_date):
    '''
    Gets all files from paths and puts them together
    into a single dataframe. If self.args['cache']==True,
    then this consolidated dataframe is cached / read from cache
    as applicable.
    '''
    msg = ""

    run_date    = self.args['run_date']
    config      = spec['config']
    dataset     = source['dataset']
    params      = source.get('params', {})

    cache = self.args.get("cache", False)
    cachename = f"{dataset}-{start_date}-to-{end_date}"
    cachefile = f"cache/{self.name}-rawdata-cache-" + cachename + ".csv"

    # read from cache if available
    if cache:
        try:
            os.makedirs(os.path.dirname(cachefile))
        except:
            pass
        if os.path.exists(cachefile):
            msg = f"Location: {cachefile}" + "\n"
            df = pd.read_csv(cachefile, **params)
            logger.debug(f"Read cached {dataset}", extra={"transform": self.name, "data": msg})
            return df

    # read from S3
    dfs = []
    for path in paths:
        _df = self.read_s3_data(path, params)
        if _df is None:
            msg += f"Path error, skipping: {path}" + "\n"
            continue
        msg += f"Read from path: {path}" + "\n"
        dfs.append(_df)
    df = pd.concat(dfs)

    logger.debug(f"Read fresh {dataset}", extra={"transform": self.name})

    # Cache it for future use
    if cache:
        df.to_csv(cachefile, index=False)

    # Insert lineage if possible
    lineage = None
    if (len(paths) > 0):
        lineage = {
            "type": "lineage",
            "transform": self.name,
            "dependencies": [
                {
                    "type": "file",
                    "nature": "input",
                    "objects": paths,
                },
            ],
        }

    if not self.state.has_frame(spec['name']):
        self.update_frame(spec, f"Dataset: {dataset}", df, lineage)

    return df
get_datewindow(source, spec)

Set the time window for observations and exogenous variables. Both can be passed in the args; if not, start_date defaults to 60 days prior to the end date, and end_date defaults to the day prior to run_date, which is usually today.

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def get_datewindow(self, source, spec):
    """
    Set the time window for observations and exogenous variables.
    Both can be passed in the args; if not,
    start_date defaults to 60 days prior to the end date, and
    end_date defaults to the day prior to run_date, which is usually today.
    """
    datewindow = {}
    default_delta = 60

    run_date = self.args['run_date']

    try:
        if 'end_date' in self.args and self.args['end_date']:
            end_date = datetime.fromisoformat(self.args['end_date'])
        else:
            logger.debug(
                f"End date not in args. Using yesterday's date.")
            end_date = run_date - timedelta(days=1)

        if 'start_date' in self.args and self.args['start_date']:
            start_date = datetime.fromisoformat(self.args['start_date'])
        else:
            logger.debug(
                f"Start date not in args. Using {default_delta} days prior to end date. ")
            start_date = end_date - timedelta(days=default_delta)
    except Exception as e:
        logger.exception(
            f"Error parsing date window for {spec['name']}.",
            extra={"transform": self.name, "data": self.args}
        )
        datewindow = None
        return datewindow

    if start_date > end_date:
        logger.exception(
                    f"Start date greater than end date. Skipping the spec {spec['name']}.",
                    extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
                )
        datewindow = None # be explicit
        return datewindow

    datewindow['start_date'] = start_date
    datewindow['end_date'] = end_date

    return datewindow
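A runnable sketch of the defaulting logic above, assuming run_date is already a datetime (the dates are placeholders)::

from datetime import datetime, timedelta

args = {
    "run_date": datetime(2023, 1, 15),
    "start_date": "2022-12-01",   # optional ISO string
    "end_date": "",               # empty -> defaults to run_date - 1 day
}

end_date = (datetime.fromisoformat(args["end_date"])
            if args["end_date"] else args["run_date"] - timedelta(days=1))
start_date = (datetime.fromisoformat(args["start_date"])
              if args["start_date"] else end_date - timedelta(days=60))
print(start_date.date(), "->", end_date.date())   # 2022-12-01 -> 2023-01-14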
get_handlers(spec)

Define various callbacks that take a dataframe, spec and compute.

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def get_handlers(self, spec):
    """
    Define various callbacks that take a dataframe, spec
    and compute.
    """
    return {}
load_source(spec)

Load all the sources into a 'data' dict and return it, along with a validity flag.

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def load_source(self, spec):
    """
    Load all the sources into a 'data' dict
    and return it, along with a validity flag.
    """
    config = spec['config']
    source = config['source']

    data = {}
    is_valid = True

    if 'observations' not in source:
        logger.exception(
            f"Spec config has no observations param, skipping.",
            extra={"transform": self.name, "data": json.dumps(spec, indent=4)})
        is_valid = False
        return is_valid, data

    if 'exovars' not in source:
        logger.debug(
            f"Exogenous variables not specified in {spec['name']}",
            extra={"transform": self.name, "data": json.dumps(spec, indent=4)})

    # get time_window for observations and exovars
    datewindow = self.get_datewindow(source, spec)
    if datewindow is None:
        logger.debug(
            f"Invalid date window for {spec['name']}",
            extra={"transform": self.name})
        is_valid = False
        return is_valid, data

    data['observations'] = {}
    for dataname, dataspec in source['observations'].items():
        dataset = self.load_dataset(spec, dataname, dataspec, datewindow)
        data["observations"][dataname] = dataset

    # then load the exovars data set if specified
    if "exovars" in source:
        data['exovars'] = {}
        for dataname, dataspec in source['exovars'].items():
            dataset = self.load_dataset(spec, dataname, dataspec, datewindow)
            data["exovars"][dataname] = dataset

    return is_valid, data
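A hypothetical source block for load_source; the per-dataset entries are whatever load_dataset (defined elsewhere) expects, shown here only as placeholders::

source = {
    "observations": {
        "daily_sales": {"dataset": "sales-daily", "filename": "sales.csv"},
    },
    "exovars": {      # optional exogenous series
        "holidays": {"dataset": "calendar-holidays", "filename": "holidays.csv"},
    },
}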
postprocess_results(spec, result)

Postprocess the results. The postprocessing method is defined in the subclass.

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def postprocess_results(self, spec, result):
    """
    Postprocess the results. The postprocessing method is defined in the subclass.
    """
    config = spec['config']
    # do post_process results
    postprocess_results = config.get('postprocess_results', None)
    if postprocess_results:
        method = postprocess_results.get('method', "")
        params = postprocess_results.get('params', {})
        handler = getattr(self, method, None)
        if handler:
            result = handler(spec, result, params)
        else:
            logger.exception(
                f"Spec: {spec['name']} -- postprocess_results method not found",
                extra={"transform": self.name}
            )
    logger.debug(f"Postprocess results for {spec['name']} done",
                 extra={"transform": self.name})

    return result
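A sketch of wiring a custom post-processing step through the spec; clip_negatives is a hypothetical method on the derived class::

config = {
    "postprocess_results": {
        "method": "clip_negatives",   # resolved with getattr(self, method)
        "params": {"floor": 0},       # passed to that method
    }
}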
precheck_spec(spec)

Check if the spec is valid

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def precheck_spec(self, spec):
    '''
    Check if the spec is valid
    '''
    is_valid_spec = True
    name = spec.get('name', 'NO_SPEC_NAME')

    enabled = spec.get("active", True)
    if not enabled:
        logger.debug(
            f"Spec not enabled, skipping.",
            extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
        )
        is_valid_spec = False
        return is_valid_spec

    for f in ["name", "config"]:
        if f not in spec:
            logger.exception(
                f"Spec has no {f} param, skipping.",
                extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
            )
            is_valid_spec = False
            return is_valid_spec

    config = spec['config']

    for f in ["source", "forecasters"]:
        if f not in config:
            logger.exception(
                f"Spec config has no {f} param, skipping.",
                extra={"transform": self.name, "data": json.dumps(spec, indent=4)}
            )
            is_valid_spec = False
            return is_valid_spec

    return is_valid_spec
process(state)

Run the computation and update the state 1. Load the datasets 2. Run forecasting 3. process the forecasting results 4. store the results

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def process(self, state):
    """
    Run the computation and update the state
    1. Load the datasets
    2. Run forecasting
    3. process the forecasting results
    4. store the results
    """
    logger.debug(
        "Start execution", extra=self.config.get_extra({"transform": self.name})
    )

    # Will be used in other places..
    self.state = state

    # Get the profile spec
    is_valid, profile, msg = profilespec.get_profile(self, "policyapp.forecasting")
    if is_valid:
        logger.debug(
            f"Loaded profilespec",
            extra={"transform": self.name, "data": msg}
        )
    else:
        logger.error(
            f"Could not load profilespec",
            extra={"transform": self.name, "data": msg}
        )
        raise Exception("could not load profilespec")
    specs = profile.get("specs", None)
    if specs is None:
        raise Exception("Could not find 'specs' in profile")

    # Now go through each spec and process it
    for spec in specs:

        do_process_spec = self.precheck_spec(spec)
        if do_process_spec == False:
            continue

        ## we can now proceed with processing the spec
        # load source
        do_process_spec, data = self.load_source(spec)
        if do_process_spec == False:
            continue

        # post process the sources
        data = self.combined_dataset(spec, data)

        # run the forecasters
        result = self.process_spec(spec, data)
        if result is None:
            continue

        # postprocess the results
        result = self.postprocess_results(spec, result)

        # tag the result under the spec name
        result = {spec['name']: result}

        # store the  results
        self.store_result(spec, result)


    # Done
    logger.debug(
        "Complete execution", extra=self.config.get_extra({"transform": self.name})
    )

    ###########################################
    # => Return
    ###########################################
    return state
process_spec(spec, data)

Process the forecaster spec. generate result and chart for each forecaster

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def process_spec(self, spec, data):
    """
    Process the forecaster spec.
    generate result and chart for each forecaster
    """

    def store_chart(tsf):
        '''
        We generate multiple charts for each forecaster
        Immediately store the charts as we generate them
        '''
        viz = tsf['viz']
        filename = f"{forecaster_name}-{name}-forecasting.png"
        msg = self.store_viz(spec, filename, viz)
        tsf.pop('viz', None)
        return msg

    msg = ""
    name = spec["name"]
    config = spec["config"]

    forecasters = config['forecasters']

    # if forecasters is not a dict
    if not isinstance(forecasters, dict):
        logger.exception("Forecasters must be a dict",
                         extra={"transform": self.name})
        raise Exception("Forecasters must be a dict")

    result = {"forecasts": {}}
    for forecaster_name, forecaster in forecasters.items():

        tsf = self.run_forecasting(spec, data, forecaster_name, forecaster)
        msg += store_chart(tsf)

        logger.debug(f"Processed and then saved visualization for {forecaster_name}",
                extra={"transform": self.name, "data": msg})

        result['forecasts'][forecaster_name] = tsf

    logger.debug(f"Done processing all the forecasters",
            extra={"transform": self.name})

    return result
run_forecasting(spec, data, forecaster_name, forecaster)

Instantiate the forecaster and run forecasting

Source code in enrichsdk/contrib/lib/transforms/timeseries_forecaster/__init__.py
def run_forecasting(self, spec, data, forecaster_name, forecaster):
    """
    Instantiate the forecaster and run forecasting
    """
    # default is prophet
    # type is vanilla
    strategy = forecaster.get('strategy', self.default_strategy)
    type = forecaster.get('type', self.default_type)
    params = forecaster.get('params', {})

    # return timeseries forecast
    tsf = {}
    chart_params = params.get('chart_params', {})

    if strategy == 'prophet':

        if type == "vanilla":
            observation  = params.get('observation', None)

            if observation is None:
                logger.exception(f"Observation time series must be specified for forecaster: {forecaster_name}",
                                extra={"transform": self.name, "data": json.dumps(forecaster, indent=4)})
                raise Exception("Observation must be specified for prophet forecaster")

            df = data['observations'][observation]

            forecast_obj = BaseProphetForecasterModel(df)
            forecast = forecast_obj.run_forecasting(params)

            viz = forecast_obj.visualize_forecasting(forecast, chart_params)
            del forecast_obj

        elif type ==  "exogenous":
            df = data['combined']

            forecast_obj = BaseProphetForecasterModel(df)
            forecast = forecast_obj.run_forecasting(params)
            viz = forecast_obj.visualize_forecasting(forecast, chart_params)
            del forecast_obj

        else:
            logger.excption(f"Invalid type for prophet forecaster: {forecaster_name}",
                            extra={"transform": self.name, "data": json.dumps(forecaster, indent=4)})
            raise Exception("Invalid type for prophet forecaster")

    tsf = {
        "forecast" : forecast,
        "viz" : viz,
    }
    msg = note(forecast, f"Forecast for {forecaster_name}")
    logger.debug(f"Forecasted time series for {forecaster_name}",
                    extra={"transform": self.name, "data": msg})

    return tsf
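An illustrative forecasters block -- the keys follow run_forecasting, while the forecaster names and parameter values are examples::

forecasters = {
    "baseline": {
        "strategy": "prophet",
        "type": "vanilla",       # forecasts a single observation series
        "params": {"observation": "daily_sales", "chart_params": {}},
    },
    "with_exog": {
        "strategy": "prophet",
        "type": "exogenous",     # uses the combined dataset with exogenous variables
        "params": {"chart_params": {}},
    },
}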