RAG+LlamaParse: Leading a new era of PDF parsing and retrieval!

Updated on: July 17, 2025
**Explore the future of PDF parsing and retrieval, and how the combination of RAG and LlamaParse will change the way information is processed.**
Core content:
1. How RAG works and its key role in data-driven generative AI
2. The challenges of extracting information from PDF files, and the advantages of LlamaParse
3. The prospects for applying LlamaParse to complex documents containing tables, images, and other elements
```python
!pip install llama-index
!pip install llama-index-core
!pip install llama-index-embeddings-openai
!pip install llama-parse
!pip install llama-index-vector-stores-kdbai
!pip install pandas
!pip install llama-index-postprocessor-cohere-rerank
!pip install kdbai_client
```
```python
from llama_parse import LlamaParse
from llama_index.core import Settings
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.kdbai import KDBAIVectorStore
from llama_index.postprocessor.cohere_rerank import CohereRerank
from getpass import getpass
import os
import kdbai_client as kdbai
```
```python
# llama-parse is async-first; running the async code in a notebook requires nest_asyncio
import nest_asyncio

nest_asyncio.apply()
```

```python
# API access to llama-cloud
os.environ["LLAMA_CLOUD_API_KEY"] = (
    os.environ["LLAMA_CLOUD_API_KEY"]
    if "LLAMA_CLOUD_API_KEY" in os.environ
    else getpass("LLAMA CLOUD API key: ")
)

# Using the OpenAI API for embeddings/LLMs
os.environ["OPENAI_API_KEY"] = (
    os.environ["OPENAI_API_KEY"]
    if "OPENAI_API_KEY" in os.environ
    else getpass("OpenAI API Key: ")
)

# Set up the KDB.AI endpoint and API key
KDBAI_ENDPOINT = (
    os.environ["KDBAI_ENDPOINT"]
    if "KDBAI_ENDPOINT" in os.environ
    else input("KDB.AI endpoint: ")
)
KDBAI_API_KEY = (
    os.environ["KDBAI_API_KEY"]
    if "KDBAI_API_KEY" in os.environ
    else getpass("KDB.AI API key: ")
)

# Connect to KDB.AI
session = kdbai.Session(api_key=KDBAI_API_KEY, endpoint=KDBAI_ENDPOINT)
```
```python
# Table schema: one row per parsed node
schema = [
    dict(name="document_id", type="str"),
    dict(name="text", type="str"),
    dict(name="embeddings", type="float32s"),
]

# Flat (exact search) vector index on the embeddings column;
# 1536 dimensions matches text-embedding-3-small
indexFlat = {
    "name": "flat",
    "type": "flat",
    "column": "embeddings",
    "params": {"dims": 1536, "metric": "L2"},
}

# Connect to the KDB.AI database
db = session.database("default")

KDBAI_TABLE_NAME = "LlamaParse_Table"

# First ensure the table does not already exist
try:
    db.table(KDBAI_TABLE_NAME).drop()
except kdbai.KDBAIException:
    pass

# Create the table
table = db.create_table(KDBAI_TABLE_NAME, schema, indexes=[indexFlat])
```
```python
# Download the example paper: "LLM In-Context Recall is Prompt Dependent"
!wget 'https://arxiv.org/pdf/2404.08865' -O './LLM_recall.pdf'
```
```python
EMBEDDING_MODEL = "text-embedding-3-small"
GENERATION_MODEL = "gpt-4o"

llm = OpenAI(model=GENERATION_MODEL)
embed_model = OpenAIEmbedding(model=EMBEDDING_MODEL)

Settings.llm = llm
Settings.embed_model = embed_model
```
```python
pdf_file_name = './LLM_recall.pdf'

parsing_instructions = '''The document titled "LLM In-Context Recall is Prompt Dependent" is an academic preprint from April 2024, authored by Daniel Machlab and Rick Battle from the VMware NLP Lab. It explores the in-context recall capabilities of Large Language Models (LLMs) using a method called "needle-in-a-haystack," where a specific factoid is embedded in a block of unrelated text. The study investigates how the recall performance of various LLMs is influenced by the content of prompts and the biases in their training data. The research involves testing multiple LLMs with varying context window sizes to assess their ability to recall information accurately when prompted differently. The paper includes detailed methodologies, results from numerous tests, discussions on the impact of prompt variations and training data, and conclusions on improving LLM utility in practical applications. It contains many tables. Answer questions using the information in this article and be precise.'''

documents = LlamaParse(
    result_type="markdown",
    parsing_instructions=parsing_instructions,
).load_data(pdf_file_name)

print(documents[0].text[:1000])
```
```python
# Parse the documents using MarkdownElementNodeParser
# (the original chained .from_defaults() onto the instance, which would
# discard the llm and num_workers arguments; the constructor alone is correct)
node_parser = MarkdownElementNodeParser(llm=llm, num_workers=8)

# Retrieve nodes (text) and objects (tables)
nodes = node_parser.get_nodes_from_documents(documents)
```
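The imports at the top pull in `KDBAIVectorStore`, `StorageContext`, and `VectorStoreIndex`, but the walkthrough jumps from node parsing straight to querying the table. The block below is a minimal sketch of the indexing step those imports imply: it splits the parsed output into text nodes and table objects, then embeds them and writes the rows into the KDB.AI table created above.

```python
# Minimal sketch of the indexing step (assumed from the imports above)
# Split parsed output into plain-text nodes and extracted table objects
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

# Back a LlamaIndex vector index with the KDB.AI table created earlier
vector_store = KDBAIVectorStore(table)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Building the index embeds every node with text-embedding-3-small
# (via Settings.embed_model) and inserts the rows into KDB.AI
index = VectorStoreIndex(
    nodes=base_nodes + objects,
    storage_context=storage_context,
)
```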
```python
# Note: this imports the openai SDK client, shadowing the llama_index
# OpenAI LLM imported earlier (which has already been used above)
from openai import OpenAI

client = OpenAI()


def embed_query(query):
    # Embed the query with the same model used to index the documents
    query_embedding = client.embeddings.create(
        input=query,
        model="text-embedding-3-small",
    )
    return query_embedding.data[0].embedding


def retrieve_data(query):
    # Nearest-neighbour search on the flat index; the '<>' filter
    # excludes one previously indexed document by its id
    query_embedding = embed_query(query)
    results = table.search(
        vectors={'flat': [query_embedding]},
        n=5,
        filter=[('<>', 'document_id', '4a9551df-5dec-4410-90bb-43d17d722918')],
    )
    retrieved_data_for_RAG = []
    for index, row in results[0].iterrows():
        retrieved_data_for_RAG.append(row['text'])
    return retrieved_data_for_RAG


def RAG(query):
    # Assemble the retrieved chunks into the prompt context, then generate
    question = "You will answer this question based on the provided reference material: " + query
    messages = "Here is the provided context: " + "\n"
    results = retrieve_data(query)
    if results:
        for data in results:
            messages += data + "\n"
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": question},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": messages},
                ],
            },
        ],
        max_tokens=300,
    )
    content = response.choices[0].message.content
    return content
```
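With the helper in place, a question can be answered end to end; the call below is illustrative (the question string is made up, not from the article). Note also that `CohereRerank` is imported at the top but never used in this excerpt; as a hedged alternative under the assumption that a `COHERE_API_KEY` is available, the index built above can be queried through LlamaIndex with Cohere reranking layered on top of the flat vector search.

```python
# Illustrative query against the hand-rolled pipeline above
print(RAG("At what context lengths does in-context recall start to degrade?"))

# Alternative sketch using the imported CohereRerank postprocessor
# (top_n and similarity_top_k are illustrative choices, not from the source)
cohere_rerank = CohereRerank(top_n=3)
query_engine = index.as_query_engine(
    similarity_top_k=10,
    node_postprocessors=[cohere_rerank],
)
print(query_engine.query("At what context lengths does in-context recall start to degrade?"))
```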