DocGen

DocGen generates documents such as memos, surveys, and other research material useful in business discussions and presentations.

LLMDocGenerator(name, cred={}, model='openai', searchapi='serpapi')

Bases: object

Class to generate text documents using LLMs. The agent is given a set of documents that act as context against which it generates text.

init the LLM query agent
name: name of the agent
cred: credentials object
model: name of the model backend to use; defaults to the OpenAI GPT model for now, will be extended in the future to support other models
searchapi: name of the search API backend to use; defaults to serpapi for now

Source code in llmsdk/agents/docgen.py
def __init__(self,
             name,
             cred={},
             model="openai",
             searchapi="serpapi"):
    """
    init the LLM query agent
    name: name of the agent
    cred: credentials object
    model: name of the model backend to use
            default to OpenAI GPT model for now
            will be extended in the future to support other models
    searchapi: name of the search API backend to use
                default to serpapi for now
    """

    start_time = time.time()

    # defaults
    self.chunk_size = 1000
    self.chunk_overlap = 100
    self.latest_context = []
    self.context_topK = 1
    self.current_kg = []
    self.metadata = {}
    self.index = None
    self.vdb_client = None
    self.index_name = None
    self.store = None

    # name
    self.agent_name = name

    # creds
    self.cred = cred
    # LLM params
    self.model = model
    self.searchapi = searchapi
    self.chaintype = "stuff"

    # init the llm and embeddings objects
    self.llm, self.embeddings = self._get_llm_objs(model=self.model,
                                                   cred=self.cred)

    # init the QnA chain for internal queries
    prompt = self._get_query_prompt_answer()
    self.llm_chain = load_qa_chain(self.llm,
                                   chain_type=self.chaintype,
                                   prompt=prompt)

    # note metadata for this agent
    self.metadata = {
        "agent": {
            "name": self.agent_name,
            "model": self.model,
            "searchapi": self.searchapi,
            "chaintype": self.chaintype,
        },
        "events": []
    }
    # log that the agent is ready
    duration = time.time() - start_time
    event = self._log_event(agent_events._EVNT_READY, duration)
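
A minimal construction sketch; the import path follows the source location shown above, and the credential contents are hypothetical (whatever your cred object expects):

from llmsdk.agents.docgen import LLMDocGenerator

# hypothetical credentials object; actual keys depend on the model backend
cred = {"api_key": "sk-..."}

agent = LLMDocGenerator(name="memo-gen",
                        cred=cred,
                        model="openai",
                        searchapi="serpapi")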

add_to_index(data)

add document(s) to the agent's index

Source code in llmsdk/agents/docgen.py
def add_to_index(self, data):
    """
    add document(s) to the agent's index
    """
    start_time = time.time()

    if self.index:
        if self.store == 'chroma':
            self.add_to_index_chroma(data)
        else:
            raise Exception(f"{self.store} does not support adding document")
    else:
        raise Exception("No available index, cannot add document")

    # log that the doc is added
    if self.index:
        duration = time.time() - start_time
        params = {
            "n_items": len(data),
        }
        event = self._log_event(agent_events._EVNT_INDEXADD, duration, params=params)

    return
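
Assuming an index already exists (see create_add_index below), appending a freshly loaded document might look like this; the file path is illustrative:

# load and chunk a new document, then append it to the existing index
new_docs = agent.load_data(source="pdf", content="reports/update.pdf")
agent.add_to_index(new_docs)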

add_to_index_chroma(data)

add document(s) to a chromadb index

Source code in llmsdk/agents/docgen.py
def add_to_index_chroma(self, data):
    """
    add document(s) to a chromadb index
    """
    # first, delete all existing docs from the same sources
    # as what we are adding, we don't want duplicates
    self._delete_docset_chroma(self.index, data)

    # now, add the new docs
    self._add_docset_chroma(self.index, data)

    # persist the db to disk
    self.vdb_client.persist()

    return

chunk_data(data)

create chunks from the data and add any needed metadata

Source code in llmsdk/agents/docgen.py
def chunk_data(self, data):
    """
    create chunks from the data
    and add any needed metadata
    """

    def cleanup_metadata(data):
        # take in a list of data document objects
        # and clean up metadata
        curr_source = None
        for i in range(0, len(data)):
            source = data[i].metadata['source']
            data[i].metadata['file'] = source.split('/')[-1]
            if curr_source != source:
                curr_source = source
                chunk = 1
            data[i].metadata['chunk'] = chunk
            data[i].metadata['id'] = self._create_id(f"{source}-{chunk}")
            chunk += 1

            ##
            ## add any other custom metadata here
            ##

        return data

    # chunk the data
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size,
                                                   chunk_overlap=self.chunk_overlap)
    chunks = text_splitter.split_documents(data)

    # add metadata
    chunks = cleanup_metadata(chunks)

    return chunks
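
Note that load_data already routes documents through chunk_data, so a direct call is only needed for Document lists you construct yourself. A sketch, assuming the langchain Document class (the import path matches older langchain releases):

from langchain.docstore.document import Document

docs = [Document(page_content="Acme Corp sells widgets.",
                 metadata={"source": "notes/acme.txt"})]
chunks = agent.chunk_data(docs)
# each chunk now carries 'source', 'file', 'chunk', and 'id' metadata
print(chunks[0].metadata)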

create_add_index(data, store='chroma', persist_directory=None, index_name=None)

create an index from a data source
data: list of langchain Document() objects
store: type of vectorstore to use (chroma, faiss, ...)

Source code in llmsdk/agents/docgen.py
def create_add_index(self, data, store='chroma', persist_directory=None, index_name=None):
    """
    create an index from a data source
    data: list of langchain Document() objects
    store: type of vectorstore to use (chroma, faiss, ...)
    """

    start_time = time.time()

    # note what store we are using and the index name
    self.store = store
    self.index_name = self.agent_name if not index_name else index_name

    # create the index
    if store == 'faiss':
        self.index = FAISS.from_documents(data, self.embeddings)
    elif store == 'chroma':
        self.index = self.create_add_index_chroma(data, persist_directory=persist_directory)
    else:
        self.index = None
        self.store = None

    # log that the index is ready
    if self.index:
        duration = time.time() - start_time
        params = {
            "store": store,
            "persist_directory": persist_directory,
            "index_name": self.index_name,
            "n_items": len(data)
        }
        event = self._log_event(agent_events._EVNT_INDEXCREATE, duration, params=params)

    return
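
A typical indexing flow, loading a folder of documents and persisting a chroma index to disk; the paths and index name are illustrative:

data = agent.load_data(source="dir", content="docs/", params={"glob": "**/*.pdf"})
agent.create_add_index(data,
                       store="chroma",
                       persist_directory=".chromadb",
                       index_name="memo-index")
print(agent.get_index_stats())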

create_add_index_chroma(data, persist_directory=None)

Init the chromadb index and populate it with a set of documents

Source code in llmsdk/agents/docgen.py
def create_add_index_chroma(self, data, persist_directory=None):
    """
    Init the chromadb index and populate it with a set of documents
    """
    # init the ChromaDB client
    self.vdb_client = chromadb.Client(Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=persist_directory # Optional, defaults to .chromadb/ in the current directory
    ))

    # make sure we're starting with a fresh db and collection
    self.vdb_client.reset()
    index = self.vdb_client.get_or_create_collection(name=self.index_name,
                                                     embedding_function=self.embeddings.embed_documents)

    # populate the collection
    self._add_docset_chroma(index, data)

    # persist the index to disk
    if persist_directory:
        self.vdb_client.persist()

    return index

generate_doc(profilespec='profilespec.json')

generate the document

Source code in llmsdk/agents/docgen.py
def generate_doc(self, profilespec="profilespec.json"):
    """
    generate the document
    """

    start_time = time.time()

    # load the spec first
    self.spec = self.load_spec(profilespec)
    if self.spec is None:
        return None

    # get prefix, suffix
    prefix = self.spec.get('prompt', {}).get('prefix', "")
    suffix = self.spec.get('prompt', {}).get('suffix', "")

    # get sections
    sections = self.spec.get('sections')

    full_text = "-- BEGIN: Investor Memo --" + "\n\n\n"

    for section, detail in sections.items():

        enable = detail.get('enable', True)
        if not enable:
            continue

        # form the section header
        header = detail['title']
        header_text = f'{header}' + "\n" + '_'*len(header) + "\n\n"

        # run through each question for that section
        section_text = ""
        section_sources = []
        section_keywords = []
        for query in detail['questions']:
            # query using the agent
            query = f"{prefix} {query} {suffix}"
            prompt = { "query": query }
            result = self.prompt(prompt, mode="answer")
            answer = result['answer']
            sources = result['sources']
            keywords = result['keywords']
            section_sources += [source['source'] for source in sources]
            section_keywords += keywords

            # form the answer for this question
            section_text += f'{answer}' + "\n\n"

        # summarize the section if needed
        collate = detail.get('collate')
        if collate:
            intent = detail.get('intent', detail.get('title'))
            prompt = {
                "text": section_text,
                "intent": intent,
                "collate": collate,
            }
            result = self.prompt(prompt, mode="collate")
            section_text = result['answer']
            section_text += "\n\n"
            section_keywords = result['keywords']

        # add the keywords for this section
        section_keywords = list(set(section_keywords))
        section_text += "Keywords: " + ", ".join(section_keywords) + "\n\n"

        # add the sources for this section
        section_sources = list(set(section_sources))
        section_text += "Sources: " + ", ".join(section_sources) + "\n\n\n"

        # create the section text for this section
        full_text += f'{header_text}{section_text}'

    full_text += "\n-- Investor Memo :END --"

    # log the event
    params = {
        "spec": self.spec
    }
    duration = time.time() - start_time
    event = self._log_event(agent_events._EVNT_DOCGEN, duration, params=params)

    return full_text
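
Since load_spec also accepts a dict, the spec can be passed inline instead of as a JSON file. A hypothetical spec showing the fields generate_doc reads (prompt prefix/suffix, plus per-section title, enable, questions, and the optional collate/intent pair):

spec = {
    "prompt": {
        "prefix": "You are drafting an investor memo.",
        "suffix": "Answer in two or three sentences.",
    },
    "sections": {
        "market": {
            "title": "Market Opportunity",
            "enable": True,
            "questions": [
                "What market does the company operate in?",
                "How large is the addressable market?",
            ],
            "collate": "summarize",
            "intent": "one tight paragraph",
        },
    },
}
memo = agent.generate_doc(profilespec=spec)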

generate_query_prompt_collation(text, intent, method)

generate a prompt for summarizing text

Source code in llmsdk/agents/docgen.py
def generate_query_prompt_collation(self, text, intent, method):
    """
    generate a prompt for summarizing text
    """

    if method == 'summarize':
        template = """Given the following long text (given in backquotes below), summarize it to be {intent}.

        ```
        {text}
        ```

        Summary:
        """
    elif method == 'rewrite':
        template = """Rewrite the following text (given in backquotes below), {intent}, to make it more coherent.

        ```
        {text}
        ```

        Response:
        """
    else:
        template = None

    prompt = PromptTemplate(
        input_variables=["intent", "text"],
        template=template,
    )

    return prompt.format(intent=intent, text=text)

generate_query_prompt_kwords(context='')

generate a prompt for extracting keywords from a paragraph

Source code in llmsdk/agents/docgen.py
def generate_query_prompt_kwords(self, context=""):
    """
    generate a prompt for extracting keywords from a paragraph
    """
    template = """You are the chief of staff to a busy executive. You will be given a paragraph of text
    and must identify at most five key phrases from the paragraph. Do not summarize what you find,
    respond with the key phrases exactly. Structure your response as a json list exactly.
    DO NOT RESPOND WITH ANY KEY PHRASES FROM CONTENT ABOVE THIS LINE.

    Here is the paragraph:
    {context}

    The key phrases are:
    """

    prompt = PromptTemplate(
        input_variables=["context"],
        template=template,
    )

    return prompt.format(context=context)

get_index()

return the index object

Source code in llmsdk/agents/docgen.py
def get_index(self):
    """
    return the index object
    """
    return self.index

get_index_stats()

return some stats about the agent's index

Source code in llmsdk/agents/docgen.py
def get_index_stats(self):
    """
    return some stats about the agent's index
    """
    stats = None
    if self.index:
        try:
            stats = {
                "name": self.index_name,
                "store": self.store,
                "n_items": self.index.count()
            }
        except:
            raise Exception("Index does not support stats")
    return stats

get_metadata()

return metadata collected by the agent

Source code in llmsdk/agents/docgen.py
def get_metadata(self):
    """
    return metadata collected by the agent
    """
    return self.metadata

get_similar_docs(query, topk=7)

get top-K similar docs from the agent's index given a query
query: query string
topk: number of top-K similar docs matching the query to return

Source code in llmsdk/agents/docgen.py
def get_similar_docs(self, query, topk=7):
    """
    get top-K similar docs from the agent's index given a query
    query: query string
    topk: number of top-K similar docs matching the query to return
    """
    docs = None
    if self.index:
        if self.store == 'faiss':
            docs = self.index.similarity_search(query,
                                                k=topk,
                                                include_metadata=True)
        elif self.store == 'chroma':
            docs = self.search_chromadb(query,
                                        k=topk,
                                        include_metadata=True)
        else:
            docs = None

    return docs
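
A retrieval sketch, assuming an index has been created or loaded; the query is illustrative:

docs = agent.get_similar_docs("Who are the company's main competitors?", topk=5)
for doc in docs:
    print(doc.metadata["source"], "->", doc.page_content[:80])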

load_data(source, content, metadata={}, params={})

set up the datasource loader, load and clean the data
source:
    'dir' points to a folder with one or more files to load
    'pdf' points to a single PDF file to load
    'str' contains a text string to load
content: data content (path, text) depending on type of source
params: extra params needed for specific loaders
    glob: what glob filter to use if source=='dir'
    pdfloader: what type of pdf loader module to use if source=='pdf'
metadata: any custom metadata to pass when source=='str'

Source code in llmsdk/agents/docgen.py
def load_data(self, source, content, metadata={}, params={}):
    """
    set up the datasource loader, load and clean the data
    source:
        'dir' points to a folder with one or more files to load
        'pdf' points to a single PDF file to load
        'str' contains a text string to load
    content: data content (path, text) depending on type of source
    params: extra params needed for specific loaders
                glob: what glob filter to use if source=='dir'
                pdfloader: what type of pdf loader module to use if source=='pdf'
    metadata: any custom metadata to pass when source=='str'
    """

    start_time = time.time()

    if source == 'dir':
        glob = params.get("glob", "**/*.*")
        loader = DirectoryLoader(content, glob=glob, recursive=True)
        data = loader.load()

    elif source == 'pdf':
        pdfloader = params.get("pdfloader", "pymupdf")
        if pdfloader == "pymupdf":
            loader = PyMuPDFLoader(content)
            data = loader.load()
        elif pdfloader == "pypdf":
            loader = PyPDFLoader(content)
            data = loader.load_and_split()
        elif pdfloader == "pypdfium2":
            loader = PyPDFium2Loader(content)
            data = loader.load()
        elif pdfloader == "pdfminer":
            loader = PDFMinerLoader(content)
            data = loader.load()
        else:
            data = None

    elif source == 'str':
        # special handling for string inputs
        metadata["source"] = source
        data = [Document(page_content=content, metadata=metadata)]

    else:
        data = None

    # chunk the data
    data = self.chunk_data(data)

    # log that the data loader is ready
    duration = time.time() - start_time
    params = {
        "source": source,
        "content": content,
        "params": params,
        "metadata": metadata,
    }
    event = self._log_event(agent_events._EVNT_DATA, duration, params=params)

    return data
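
Sketches of the three source types; the paths and strings are illustrative:

# a directory of files, filtered by glob
data = agent.load_data(source="dir", content="docs/", params={"glob": "**/*.txt"})

# a single PDF, choosing the loader module
data = agent.load_data(source="pdf", content="pitch.pdf", params={"pdfloader": "pymupdf"})

# a raw string, with custom metadata
data = agent.load_data(source="str",
                       content="Acme Corp sells widgets.",
                       metadata={"title": "overview"})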

load_index(persist_directory, index_name, store='chroma')

load an already persisted index from a directory
persist_directory: location of persisted index
index_name: name of the index to load
store: type of vectorstore to use (chroma, ...); only supports chroma for now

Source code in llmsdk/agents/docgen.py
def load_index(self, persist_directory, index_name, store='chroma'):
    """
    load an already persisted index from a directory
    persist_directory: location of persisted index
    store: type of vectorstore to use (chroma, ...)
            only supports chroma for now
    """
    start_time = time.time()

    # make note of the store type
    self.store = store

    # load the index
    if self.store == 'chroma':
        self.vdb_client = chromadb.Client(Settings(
                                chroma_db_impl="duckdb+parquet",
                                persist_directory=persist_directory
                            ))
        self.index_name = index_name
        index = self.vdb_client.get_collection(name=self.index_name,
                                              embedding_function=self.embeddings.embed_documents)
    else:
        index = None

    # log that the index is ready
    duration = time.time() - start_time
    params = {
        "store": self.store,
        "persist_directory": persist_directory,
    }
    event = self._log_event(agent_events._EVNT_INDEXLOAD, duration, params=params)

    self.index = index

    return
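
Re-attaching to an index persisted by an earlier run; the directory and name are illustrative and must match what was used at creation time:

agent.load_index(persist_directory=".chromadb",
                 index_name="memo-index",
                 store="chroma")
stats = agent.get_index_stats()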

load_spec(profilespec)

load the spec for generating the document
profilespec: path to a JSON spec file, or a dict containing the spec

Source code in llmsdk/agents/docgen.py
def load_spec(self, profilespec):
    """
    load the spec for generating the document
    profilespec: path to a JSON spec file, or a dict containing the spec
    """
    # check path
    if isinstance(profilespec, dict):
        self.validate_spec(profilespec)
        return profilespec

    if os.path.exists(profilespec) and profilespec.endswith(".json"):
        with open(profilespec, "r") as fd:
            spec = json.load(fd)
            self.validate_spec(spec)
            return spec

    return None

prompt(prompt, mode='answer')

run a query on the index using an LLM chain object
prompt: prompt dict containing
    query: when mode='answer'
    text: when mode='collate'
    intent: (optional) when mode='collate'
    collate: (optional) when mode='collate'
mode: 'answer' to query over the indexed docset, 'collate' to summarize or rewrite text

Source code in llmsdk/agents/docgen.py
def prompt(self, prompt, mode="answer"):
    """
    run a query on the index using an llm chain object
    prompt: prompt dict containing
        query: when mode='answer'
        text: when mode='collate'
        intent: (optional) when mode='collate'
        collate: (optional) when mode='collate'
    mode: 'answer' to query over the indexed docset,
          'collate' to summarize or rewrite text
    """

    start_time = time.time()

    result = None
    if mode == 'answer':
        query = prompt.get('query')
        if query:
            result = self.run_query_answer(query)
    elif mode == 'collate':
        text = prompt.get('text')
        intent = prompt.get('intent', "one para")
        collation = prompt.get('collate', "summarize")
        if text:
            result = self.run_query_collation(intent=intent, text=text, method=collation)
    else:
        pass

    if result:
        answer = result['answer']
        # add keywords identified to the result
        result['keywords'] = self.run_query_kwords(context=answer)

    # log the event
    params = {
        "prompt": prompt,
        "mode": mode,
        "result": result.copy() if result else None,
    }
    duration = time.time() - start_time
    event = self._log_event(agent_events._EVNT_QUERY, duration, params=params)

    # add the event to the result
    if result:
        result['metadata'] = {
            "timestamp": event['timestamp'],
            "duration": event['duration'],
        }

    return result
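
Both prompt modes in use; the collate input here is a placeholder string:

# mode='answer': query the indexed docset
result = agent.prompt({"query": "Who are the founders?"}, mode="answer")
print(result["answer"], result["keywords"])

# mode='collate': summarize or rewrite previously generated text
long_text = "...several paragraphs of generated text..."
result = agent.prompt({"text": long_text, "intent": "one para", "collate": "summarize"},
                      mode="collate")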

run_query_answer(query)

run a query using the LLM on an internal docset indexed in the agent's index; this is useful when looking for answers using a private source of data

Source code in llmsdk/agents/docgen.py
def run_query_answer(self, query):
    """
    run a query using llm on an internal docset indexed in index
    this is useful when looking for answers using a private source of data
    """
    # get the similar docs
    docs = self.get_similar_docs(query)

    # setup the QnA chain object
    response = self.llm_chain({"input_documents":docs, "question":query},
                                return_only_outputs=True)

    # run the query against the similar docs
    result = {
        "question": query,
        "answer": response.get('output_text', self._err_msg('field')).strip(),
        "sources": [{"content": d.page_content, "source": d.metadata['source']} for d in docs]
    }

    # check whether we got a usable answer
    if ('output_text' not in response) or ("i am not sure" in result['answer'].lower()):
        # we don't have a usable answer, so no need for sources
        result['sources'] = []

    return result

run_query_collation(intent, text, method)

run a query using the LLM to summarize or rewrite multiple paragraphs of text

Source code in llmsdk/agents/docgen.py
def run_query_collation(self, intent, text, method):
    """
    run a query using llm to summarize multiple paras of text
    """
    # augment the query with some context to guide the LLM
    prompt = self.generate_query_prompt_collation(text, intent, method)
    result = self.llm(prompt)
    result = result.strip()
    result = {
        "intent": intent,
        "text": text,
        "answer": result,
        "sources": [{"content": text, "source": f"doc-collation-{method}"}]
    }

    return result

run_query_kwords(context='')

run a query using the LLM to extract keywords from a paragraph of text

Source code in llmsdk/agents/docgen.py
def run_query_kwords(self, context=""):
    """
    run a query using the llm to extract keywords from a paragraph of text
    """
    # augment the query with some context to guide the LLM
    query = self.generate_query_prompt_kwords(context)
    result = self.llm(query)
    result = result.strip()
    # a few tries to extract the response
    # sometimes, the LLM messes up
    try:
        result = json.loads(result)
    except:
        try:
            result = json.loads(f"[{result.split('[')[-1]}")
        except:
            pass

    return result

search_chromadb(query, k=7, include_metadata=False)

run a search against the chromadb index for a single query

Source code in llmsdk/agents/docgen.py
def search_chromadb(self, query, k=7, include_metadata=False):
    """
    run a search against the chromadb index for a single query
    """
    # perform query
    results = self.index.query(
                query_texts=[query],
                n_results=k,
                where=None,
                where_document=None)
    # construct result docset
    docs = []
    for i in range(0, len(results['documents'][0])):
        page_content = results['documents'][0][i]
        metadata = results['metadatas'][0][i]
        doc = Document(page_content=page_content, metadata=metadata)
        docs.append(doc)

    return docs