I'm trying to use RetrievalQA with ChromaDB to create a Q&A bot on our company's documents. The general setup is below.
Import libraries:
from langchain.vectorstores.chroma import Chroma  # for storing and retrieving vectors
from langchain.embeddings.openai import OpenAIEmbeddings  # for embedding text
from langchain.text_splitter import CharacterTextSplitter  # for splitting text into chunks
from langchain.llms import OpenAI  # for using the OpenAI API
from langchain.chains import RetrievalQA  # for question-and-answer retrieval
from langchain.document_loaders import DirectoryLoader  # for loading documents from a directory
import magic
import os
import nltk
Load docs, chunk, and embed:
loader = DirectoryLoader('dir', glob='**/*.txt')
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
texts = text_splitter.split_documents(documents)
embeddings = OpenAIEmbeddings()
persist_directory = 'db'
docsearch = Chroma.from_documents(
    texts,
    embeddings,
    persist_directory=persist_directory
)
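One quick check at this point is to print the metadata on a few chunks to see what 'source' values actually got stored, and therefore what the filter further down would have to match:

# debugging aid: inspect the 'source' metadata attached to each chunk
for t in texts[:3]:
    print(t.metadata)  # e.g. {'source': 'dir/DB_Manual.txt'}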
Then start the chain and ask a question:
llm = OpenAI(temperature=0.1, model_name='gpt-3.5-turbo', cache=False, verbose=True)
chain_type_kwargs = {"prompt": PROMPT_1}  # PROMPT_1 is a PromptTemplate defined elsewhere (not shown)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={'filter': {'source': 'DB_Manual.txt'}}),
    chain_type_kwargs=chain_type_kwargs
)
query = "what's our company's dress code?"
result = qa.run(query)
result
With multiple docs in the DB, the search_kwargs filter doesn't seem to work consistently. I tried asking about the dress code while filtering to a manual that has nothing to do with dress codes, expecting it to answer that it doesn't know, but sometimes it would still give me the correct answer about the dress code.
I suspect it's the metadata: maybe Chroma doesn't include 'source' as metadata by default, and I need to load the docs with the 'source' metadata defined explicitly? If so, does anyone know how to do that through LangChain?
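For what it's worth, a minimal sketch of setting the 'source' metadata explicitly after loading. This assumes you want the bare filename rather than whatever path the loader recorded, and it would have to run before Chroma.from_documents, since the filter matches whatever was stored at indexing time:

import os

# hypothetical fix-up: normalize 'source' to the bare filename so a filter
# value like 'DB_Manual.txt' matches regardless of any directory prefix
for doc in documents:
    doc.metadata['source'] = os.path.basename(doc.metadata['source'])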
The below is working for me, with LangChain version 0.0.223.
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import AzureOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import DirectoryLoader
from langchain.vectorstores import Chroma
from dotenv import load_dotenv
load_dotenv('../../.env')
document_directory = '../../data'
embedding_function = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    deployment=os.getenv('EMBEDDING_DEPLOYMENT_NAME'),
    model=os.getenv('EMBEDDING_MODEL'),
    chunk_size=1
)
loader = DirectoryLoader(document_directory)
documents = loader.load()
db = Chroma.from_documents(documents, embedding_function)
llm = AzureOpenAI(
    deployment_name=os.getenv('CHAT_DEPLOYMENT_NAME'),
    model_name=os.getenv('CHAT_MODEL'),
    temperature=0,
    openai_api_version='2023-05-15'
)
vec = db.as_retriever(search_kwargs={"filter": {"source":'..\\..\\data\\musk-article-1.txt'}})
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vec,
    return_source_documents=True
)
response = qa({"question": "Who is William Rich?", "chat_history": []})
print(response['answer'])
I have two articles. The first article specifically mentions William Rich, and the second does not.
When I run this with:
vec = db.as_retriever(search_kwargs={"filter": {"source":'..\\..\\data\\musk-article-1.txt'}})
I get:
William Rich is an Employee
But when I change the source document to:
vec = db.as_retriever(search_kwargs={"filter": {"source":'..\\..\\data\\musk-article-2.txt'}})
I get:
"William Rich is not mentioned in the context"
This confirms that the source filtering is working, since William Rich is not mentioned in musk-article-2.txt.
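Since return_source_documents=True is set, one extra check (a sketch, not part of the run above) is to print which chunks the retriever actually returned and confirm they all come from the filtered file:

# every retrieved chunk should come from the filtered source file
for doc in response['source_documents']:
    print(doc.metadata['source'])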