"""Installing required packages"""
# Notebook shell escapes (the leading `!` is IPython syntax, not plain Python):
# run these in Jupyter/Colab, or run the same `pip install ...` commands in a
# terminal before executing the rest of the file.
!pip install langchain
!pip install openai
!pip install docx2txt
"""Get an API key from https://openai.com/product"""
import os

# Paste your OpenAI API key between the quotes, or leave it empty to use an
# OPENAI_API_KEY that is already exported in the environment.
OPENAI_API_KEY = ""

# Only write the variable when a key was actually provided, so that an empty
# placeholder never wipes out a key that is already configured.
if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
"""Subscribe to the free tier at https://www.pinecone.io/ and get an API key.

Importing necessary libraries
"""
# Notebook shell escape (IPython `!` syntax): run in Jupyter/Colab, or run
# `pip install pinecone-client` in a terminal instead.
!pip install pinecone-client
import os
import openai
import pinecone
from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
"""Loading documents"""
# Path of the Word document to load, relative to the current working directory.
file = './data/mtd.docx'
loader = Docx2txtLoader(file)
documents = loader.load()
# Bare expression: displays the loaded documents when this is the last line of
# a notebook cell; it has no effect when run as a plain script.
documents
"""Splitting documents"""
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
    """Split documents into smaller chunks with a recursive character splitter.

    Args:
        documents: Documents to split, in the form accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)
# Chunk the loaded documents using split_docs' default chunk size and overlap,
# then report how many chunks were produced.
docs = split_docs(documents=documents)
print(len(docs))
"""Embedding documents with OpenAI"""
# Notebook shell escape (IPython `!` syntax): run in Jupyter/Colab, or run
# `pip install tiktoken -q` in a terminal instead.
!pip install tiktoken -q
# Constructed without arguments; presumably picks up OPENAI_API_KEY from the
# environment -- confirm against the installed LangChain version.
embeddings = OpenAIEmbeddings()
# Embed a sample string as a quick check that the embedding call works.
query_result = embeddings.embed_query("Hello world")
# Bare expression: displays the embedding length when this is the last line of
# a notebook cell; it has no effect when run as a plain script.
len(query_result)
"""Vector search with Pinecone"""
# Take the Pinecone credentials from the environment when they are set, so the
# secrets do not have to be pasted into the file. The empty-string fallbacks
# match the original placeholders.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY", ""),
    environment=os.environ.get("PINECONE_ENVIRONMENT", ""),
)
index_name = "keba"
# `index` is the object returned by Pinecone.from_documents; the search helper
# below calls similarity_search / similarity_search_with_score on it.
# NOTE(review): presumably the Pinecone index named above must already exist
# in the account -- confirm before running.
index = Pinecone.from_documents(docs, embeddings, index_name=index_name)
"""Finding similar documents"""
def get_similiar_docs(query, k=2, score=False):
    """Search the module-level ``index`` for documents similar to ``query``.

    The misspelled function name is kept as-is because callers in this file
    use it.

    Args:
        query: Query text passed to the search method.
        k: Number of results to request.
        score: When truthy, call ``similarity_search_with_score`` instead of
            ``similarity_search``.

    Returns:
        Whatever the selected search method of ``index`` returns.
    """
    if score:
        return index.similarity_search_with_score(query, k=k)
    return index.similarity_search(query, k=k)
"""Question answering using LangChain and OpenAI LLM"""
# Model used for answering. Alternatives that were tried before and can be
# swapped in here: "text-davinci-003", "gpt-3.5-turbo".
model_name = "gpt-4"
# NOTE(review): "gpt-4" is a chat model while `OpenAI` comes from
# langchain.llms -- confirm this wrapper is the intended one for the installed
# LangChain version.
llm = OpenAI(model_name=model_name)
# Question-answering chain built with chain_type "stuff"; get_answer() runs it
# with the retrieved documents and the question.
chain = load_qa_chain(llm, chain_type="stuff")
def get_answer(query):
    """Answer ``query`` using the documents most similar to it.

    Retrieves documents with ``get_similiar_docs`` (default ``k``, no scores)
    and runs the module-level ``chain`` with them as ``input_documents``.

    Args:
        query: The question to answer; also used as the retrieval query.

    Returns:
        The result of ``chain.run``.
    """
    similar_docs = get_similiar_docs(query)
    return chain.run(input_documents=similar_docs, question=query)
"""Example queries and answers"""
# Example question (German) run through the retrieval + QA pipeline.
query = "In welchen Bereichen wird ausgebildet?"
answer = get_answer(query)
print(answer)