Bases: AgentEnablersMixin
Class to take in a dataframe and create a data dictionary for it, by classifying data columns using SADL.
Parameters:
- cred (dict): Credentials for the LLM platform. Default is {}.
- platform (str): The LLM platform to use. Default is defaults.LLM_PLATFORM.
- model (str): The LLM model to use. Default is defaults.LLM_MODEL.
- entities_filepath (str): The file path to the entities mapping file in JSON format. Default is None.
- temperature (float): Sampling temperature for the LLM inference/API call. Default is 0.
- max_tokens (int): Maximum number of tokens to generate in the inference/API call. Default is 1024.
Output:
Use the classify_columns method.
Source code in llmsdk/agents/sadl.py
| def __init__(self,
             cred={},
             platform=defaults.LLM_PLATFORM,
             model=defaults.LLM_MODEL,
             entities_filepath=None,
             max_tokens=1024,
             temperature=0):
    """
    Class to classify data columns using SADL
    Parameters:
    - entities_filepath (str): The file path to the entities mapping file in JSON format. Default is None.
    - temperature (float): Sampling temperature for the LLM inference/API call
    - max_tokens (int): Maximum number of tokens to generate in the inference/API call
    Output:
    Use the classify_columns method
    """
    start_time = time.time()

    # init
    self.context = None
    self.agent_name = "sadl"
    self.max_tokens = max_tokens
    self.temperature = temperature
    self.data = None
    self.samplesize = 5
    self.platform = platform
    self.model = model
    # store the credentials; they are referenced below as self.cred
    self.cred = cred

    # init the base entity map
    self.entity_map = self._get_base_entity_mapping()
    # init the base industry list
    self.industries = self._get_base_industries()

    # load additional entities if needed
    if entities_filepath:
        self.entity_map = self.load_entities(entities_filepath)

    # init the llm and embeddings objects
    self.llm, self.embeddings = self._get_llm_objs(platform=self.platform,
                                                   model=self.model,
                                                   embedding_model=self.embedding_model,
                                                   cred=self.cred)

    self.logger = logging.getLogger("app")
    self.agent_id = self._create_id(f"{self.agent_name}_{start_time}")

    # set metadata
    self.metadata = {
        "agent": {
            "name": self.agent_name,
            "id": self.agent_id,
            "platform": self.platform,
            "model": self.model
        },
        "events": []
    }

    # log that the agent is ready
    duration = time.time() - start_time
    event = self._log_event(agent_events._EVNT_READY, duration)
|
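Example: a minimal usage sketch. The import path and class name (SADL in llmsdk/agents/sadl.py), the credentials dict, and the sample data are assumptions for illustration, not confirmed by this page.
| import pandas as pd
from llmsdk.agents.sadl import SADL   # assumed export name

# create the agent; cred holds platform credentials (shape depends on platform)
agent = SADL(cred={},
             entities_filepath="entities.json",  # optional extra entity mappings
             max_tokens=1024,
             temperature=0)

# load a dataframe, then classify its columns
df = pd.DataFrame({"cust_email": ["a@x.com"], "txn_amt": ["10.50"]})
agent.load_data(df, source="df")
success, result_type, result = agent.classify_columns(context="retail")
|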
classify_columns(context='')
Classify the columns.
Parameters:
- context (str, optional): The context for classification. If not provided, the default context of the class instance is used.
Returns:
- success (bool): Indicates whether the classification was successful.
- result_type (str): The type of the classification result. Can be 'json' for JSON format or 'str' for string format.
- result (str or dict): The classification result. If 'result_type' is 'json', it's a dictionary; otherwise, it's a string.
Source code in llmsdk/agents/sadl.py
| def classify_columns(self, context=""):
    """
    Classify the columns.
    Parameters:
    - context (str, optional): The context for classification. If not provided, the default context of the class instance is used.
    Returns:
    - success (bool): Indicates whether the classification was successful.
    - result_type (str): The type of the classification result. Can be 'json' for JSON format or 'str' for string format.
    - result (str or dict): The classification result. If 'result_type' is 'json', it's a dictionary; otherwise, it's a string.
    """
    # set start_time
    start_time = time.time()

    # set the context for this labelling attempt
    context = self.context if context == "" else context

    # get the prompt for the LLM
    prompt = self.generate_prompt_columns()
    params = self.generate_prompt_params(context)
    chain = prompt | self.llm

    try:
        # run the chain against the prompt params
        success = True
        resp = chain.invoke(params)
        response = resp.content
        try:
            result = json.loads(response.lower().strip())
            result_type = 'json'
        except json.JSONDecodeError:
            # response is not valid JSON, fall back to the raw string
            result = response.strip()
            result_type = 'str'
    except Exception:
        success = False
        result = ""
        result_type = ""

    # get prompt
    prompt_string = self.generate_prompt_columns_string()
    params = {
        "query": str(prompt_string),
        "mode": self.data_mode,
        "success": success,
        "result_type": result_type,
        "result": result
    }

    # logging the result
    duration = time.time() - start_time
    event = self._log_event(agent_events._EVNT_QUERY, duration, params=params)

    # process the response
    return success, result_type, result
|
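Example: a sketch of consuming the three return values; the context string is illustrative and agent is a previously constructed instance.
| success, result_type, result = agent.classify_columns(context="healthcare")
if not success:
    raise RuntimeError("column classification failed")
if result_type == "json":
    # result is a dict keyed by column name
    for column, info in result.items():
        print(column, "->", info)
else:
    # the LLM response did not parse as JSON; result is the raw string
    print(result)
|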
classify_industry()
figure out what industry the dataframe is from
Source code in llmsdk/agents/sadl.py
| def classify_industry(self):
    """
    figure out what industry the dataframe is from
    """
    # get the prompt for the LLM
    prompt = self.generate_prompt_industry()
    # use the column samples loaded via load_data()
    # (the class stores these in self.data; self.df is never set in this class)
    params = self.generate_prompt_industry_params(self.data)
    chain = prompt | self.llm

    # chain
    resp = chain.invoke(params)
    response = resp.content
    result = response.lower().strip()

    # process the response
    return result
|
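Example: one plausible pattern (an assumption about intended usage, not documented here) is to detect the industry first and reuse it as the default context for column classification.
| industry = agent.classify_industry()   # e.g. "retail"
agent.context = industry                # becomes the default classification context
success, result_type, result = agent.classify_columns()
|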
generate_prompt_columns()
generate a prompt for labelling a dataframe given some context
Source code in llmsdk/agents/sadl.py
| def generate_prompt_columns(self):
    """
    generate a prompt for labelling a dataframe given some context
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """You are a data entry operator.
                Your task is to construct a data dictionary for a set of database column names given to you.
                The database is from a company in {context} industry.""",
            ),
            ("human",
             """{map_partial}
             Classify the following column names into entity and sub_entity using the above mentioned details.
             Also provide the following for each column:
             - datatype
             - true/false indicating whether the column could contain personally identifiable information (PII)
             - description
             Format your output as a nested json dictionary as follows:
             {output_format}
             Here are the input column names:
             {labels}"""),
        ]
    )

    return prompt
|
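Example: a sketch of rendering the template directly; the values supplied for map_partial, output_format, and labels are illustrative stand-ins for what generate_prompt_params produces.
| prompt = agent.generate_prompt_columns()
messages = prompt.invoke({
    "context": "retail",
    "map_partial": "Known entities: customer -> [name, email], ...",
    "output_format": '{"column": {"entity": "...", "sub_entity": "...", "datatype": "...", "pii": true, "description": "..."}}',
    "labels": "cust_email, txn_amt",
})
|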
generate_prompt_columns_string()
generate prompt in string format
Source code in llmsdk/agents/sadl.py
| def generate_prompt_columns_string(self):
    """
    generate prompt in string format
    """
    prompt = self.generate_prompt_columns()
    params = self.generate_prompt_params()
    prompt_string = prompt.invoke(params)

    return prompt_string
|
generate_prompt_industry()
generate a prompt for identifying the industry of a dataframe given some context
Source code in llmsdk/agents/sadl.py
| def generate_prompt_industry(self):
    """
    generate a prompt for identifying the industry of a dataframe given some context
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """You are a data entry operator.
                Assume you have a list of industries:
                {industries}""",
            ),
            (
                "human",
                """Your task is to identify what industry a dataset belongs to given the columns in the dataset.
                Respond with EXACTLY one option from the list of industries.
                Here are the columns in the dataset:
                {labels}"""),
        ]
    )

    return prompt
|
load_data(content, source='df')
- source (str): The type of input data. Can be "df" for a DataFrame input or "csv" for a CSV file input. Default is "df".
- content: data file path (if source=='csv') or DataFrame (if source=='df').
Source code in llmsdk/agents/sadl.py
| def load_data(self, content, source="df"):
    """
    - source (str): The type of input data. Can be "df" for a DataFrame input or "csv" for a CSV file input. Default is "df".
    - content: data file path (if source=='csv') or DataFrame (if source=='df').
    """
    if source == "csv":
        data = pd.read_csv(content,
                           on_bad_lines='skip',
                           encoding='unicode_escape')
    elif source == "df" and not content.empty:
        data = content
    else:
        return False

    # keep only a small sample of stringified values per column
    self.data = {}
    for col in data.columns:
        self.data[col] = list(data[col].astype(str).head(self.samplesize).values)

    return self.data
|
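Example: both supported input paths; the file name is illustrative.
| # from an in-memory DataFrame
samples = agent.load_data(df, source="df")

# or from a CSV file (bad lines are skipped on read)
samples = agent.load_data("data.csv", source="csv")

# samples maps each column name to up to self.samplesize stringified values,
# e.g. {"cust_email": ["a@x.com", ...], "txn_amt": ["10.50", ...]}
|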
load_entities(entities)
load any additional entity mappings and add them to the
base entity map
entities: path to entity mappings
Source code in llmsdk/agents/sadl.py
| def load_entities(self, entities):
    """
    load any additional entity mappings and add them to the
    base entity map
    entities: path to entity mappings
    """
    # get the base entity map
    entity_map = self.entity_map

    # check path
    if os.path.exists(entities) and entities.endswith(".json"):
        with open(entities, "r") as fd:
            addl_emap = json.load(fd)
        if isinstance(addl_emap, dict):
            for e, se in addl_emap.items():
                # lowercase everything
                e = e.lower()
                # we need lists
                if isinstance(se, str):
                    se = [se.lower()]
                # lowercase everything
                if isinstance(se, list):
                    se = [i.lower() for i in se]
                else:
                    # we cannot add this item, move on
                    continue
                # we can now add this mapping to the base map
                # (setdefault guards against entities absent from the base map)
                entity_map.setdefault(e, []).extend(se)
                # make sure we remove duplicates
                entity_map[e] = list(set(entity_map[e]))
            # done, we have the updated map
            self.entity_map = entity_map

    return self.entity_map
|
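Example: an illustrative entities file matching the schema load_entities expects (entity name mapped to a sub-entity string or list; everything is lowercased on load).
| # entities.json (illustrative content):
# {
#     "Customer": ["Name", "Email", "Phone"],
#     "Transaction": "Amount"
# }
entity_map = agent.load_entities("entities.json")
# entity_map["customer"] now also contains "name", "email", "phone" (deduplicated)
|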
map_to_targets(data, targets, use_content=False)
map input column names to a defined set of target classes
- data: input data to map, must come from the self.load_data(...) method
- targets: list of target classes
- use_content: if True, send a sample of data content values to the LLM to do the mapping.
  Set this to True with caution, as data leakage is possible.
Source code in llmsdk/agents/sadl.py
| def map_to_targets(self, data, targets, use_content=False):
    """
    map input column names to a defined set of target classes
    - data: input data to map, must come from the self.load_data(...) method
    - targets: list of target classes
    - use_content: if True, send a sample of data content values to the LLM to do the mapping.
      Set this to True with caution, as data leakage is possible.
    """

    def construct_prompt(data, targets, use_content):
        if use_content:
            task_subprompt = """You will be given a column name, some sample values from that column, as well as a list of target classes.
            Your task is to map the column name to one entry in the list of target classes. Use the sample values to guide your decision.
            """
        else:
            task_subprompt = """You will be given a list of input columns as well as a list of target classes.
            Your task is to map each column in the input dataframe to one entry in the list of target classes.
            """

        # construct the prompt template
        # this is the system message part
        sys_msg = f"""You are a highly advanced, AI-enabled, data mapping tool.
        {task_subprompt}
        Format your output as a json dictionary as follows:
        {{"input": "target"}}
        """

        # this is the human message part
        if use_content:
            human_subprompt = f"""Here is the column name and values
            ------ BEGIN COLUMN AND VALUES ------
            {data}
            ------- END COLUMN AND VALUES -------
            """
        else:
            human_subprompt = f"""Here are the input column names
            ------ BEGIN COLUMN NAMES ------
            {data}
            ------- END COLUMN NAMES -------
            """

        human_msg = f"""{human_subprompt}
        Here are the target classes:
        ------ BEGIN TARGET CLASSES ------
        {targets}
        ------- END TARGET CLASSES -------"""

        prompt = [
            SystemMessage(content=sys_msg),
            HumanMessage(content=human_msg),
        ]

        return prompt

    def execute_prompt(prompt):
        # initialize so the result dict is well-defined even on failure
        success = False
        response = None
        try:
            if self.platform in ['openai', 'azure']:
                with get_openai_callback() as cb:
                    response = self.llm(prompt)
                    response = response.content
                    stats = {
                        "total_tokens": cb.total_tokens,
                        "prompt_tokens": cb.prompt_tokens,
                        "completion_tokens": cb.completion_tokens,
                        "total_cost": round(cb.total_cost, 4)
                    }
                success = True
        except Exception:
            success = False
        result = {
            "success": success,
            "response": response
        }
        return result

    # run the query
    result = {
        "success": False,
        "columns": {}
    }
    if use_content:
        # map one column at a time, sending sample values along
        for column, values in data.items():
            column_str = f"{column}: {', '.join(values)}"
            prompt = construct_prompt(column_str, targets, use_content)
            res = execute_prompt(prompt)
            if res.get("success"):
                result["success"] = True
                d = json.loads(res.get("response"))
                key = list(d.keys())[0]
                result['columns'][column] = d.get(key)
    else:
        # map all column names in a single call
        columns = list(data.keys())
        prompt = construct_prompt(columns, targets, use_content)
        result = execute_prompt(prompt)
        # only parse the response when the call succeeded
        if result.get("success"):
            result['columns'] = json.loads(result.get("response"))
        else:
            result['columns'] = {}
        result.pop("response")

    return result
|
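Example: a sketch of both mapping modes; the target class names are illustrative.
| data = agent.load_data(df, source="df")   # required input shape
targets = ["customer_email", "transaction_amount", "order_date"]

# name-only mapping: no cell values are sent to the LLM
result = agent.map_to_targets(data, targets, use_content=False)

# content-assisted mapping: sends sample values per column to the LLM;
# use with caution if the data may be sensitive
result = agent.map_to_targets(data, targets, use_content=True)
print(result["columns"])   # {"input_column": "target_class", ...}
|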