多模态 RAG 系统构建:图文检索增强生成
## 前言

传统的 RAG 系统主要处理文本数据,但现实世界中很多信息是多模态的,包含图片、文档等。多模态 RAG 系统能够处理和理解多种类型的信息,为用户提供更丰富的回答。

我最近在项目中构建了一个多模态 RAG 系统,可以处理用户上传的图片和文档。今天分享一些关键技术和实现经验。

## 多模态 Embedding

### 图文 Embedding

```python
from transformers import CLIPProcessor, CLIPModel
import torch


class MultiModalEmbedding:
    """多模态 Embedding"""

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device)

    def embed_image(self, image_path: str) -> list:
        """嵌入图片"""
        from PIL import Image

        image = Image.open(image_path)
        inputs = self.processor(
            images=image,
            return_tensors="pt"
        ).to(self.device)
        with torch.no_grad():
            outputs = self.model.get_image_features(**inputs)
        return outputs[0].cpu().numpy().tolist()

    def embed_text(self, text: str) -> list:
        """嵌入文本"""
        inputs = self.processor(
            text=text,
            return_tensors="pt"
        ).to(self.device)
        with torch.no_grad():
            outputs = self.model.get_text_features(**inputs)
        return outputs[0].cpu().numpy().tolist()
```

### 文档 Embedding

```python
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


class DocumentProcessor:
    """文档处理"""

    def __init__(self, embedding_model):
        self.embedding_model = embedding_model

    def process_pdf(self, pdf_path: str) -> list:
        """处理 PDF 文档"""
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50
        )
        chunks = text_splitter.split_documents(pages)

        # 为每个 chunk 生成 embedding
        documents = []
        for chunk in chunks:
            embedding = self.embedding_model.embed_text(chunk.page_content)
            documents.append({
                "content": chunk.page_content,
                "embedding": embedding,
                "metadata": chunk.metadata
            })
        return documents
```

## 多模态向量数据库

```python
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, PointStruct, Distance


class MultimodalVectorDB:
    """多模态向量数据库"""

    def __init__(self, host: str = "localhost", port: int = 6333):
        self.client = QdrantClient(host=host, port=port)

    def create_collection(self, collection_name: str, vector_size: int = 512):
        """创建集合"""
        self.client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                size=vector_size,
                distance=Distance.COSINE
            )
        )

    def add_documents(self, collection_name: str, documents: list):
        """添加文档"""
        points = []
        for i, doc in enumerate(documents):
            points.append(PointStruct(
                id=i,
                vector=doc["embedding"],
                payload={
                    "content": doc["content"],
                    "type": doc.get("type", "text"),
                    "metadata": doc.get("metadata", {})
                }
            ))
        self.client.upsert(
            collection_name=collection_name,
            points=points
        )

    def search(self, collection_name: str, query_embedding: list, top_k: int = 5) -> list:
        """搜索"""
        results = self.client.search(
            collection_name=collection_name,
            query_vector=query_embedding,
            limit=top_k
        )
        return [
            {
                "content": r.payload["content"],
                "score": r.score,
                "type": r.payload["type"]
            }
            for r in results
        ]
```

## 多模态 RAG Pipeline

```python
class MultimodalRAG:
    """多模态 RAG 系统"""

    def __init__(self, llm, vector_db, embedding_model):
        self.llm = llm
        self.vector_db = vector_db
        self.embedding_model = embedding_model

    def add_document(self, file_path: str):
        """添加文档"""
        if file_path.endswith(".pdf"):
            documents = self._process_pdf(file_path)
        elif file_path.endswith((".jpg", ".png")):
            documents = self._process_image(file_path)
        else:
            documents = self._process_text(file_path)
        self.vector_db.add_documents("multimodal", documents)

    def _process_pdf(self, pdf_path: str) -> list:
        """处理 PDF"""
        processor = DocumentProcessor(self.embedding_model)
        return processor.process_pdf(pdf_path)

    def _process_image(self, image_path: str) -> list:
        """处理图片"""
        embedding = self.embedding_model.embed_image(image_path)
        return [{
            "content": f"图片路径: {image_path}",
            "embedding": embedding,
            "type": "image"
        }]

    def _process_text(self, text_path: str) -> list:
        """处理文本"""
        with open(text_path, "r") as f:
            content = f.read()
        embedding = self.embedding_model.embed_text(content)
        return [{
            "content": content,
            "embedding": embedding,
            "type": "text"
        }]

    def query(self, question: str) -> str:
        """查询"""
        # 1. 生成查询 embedding
        query_embedding = self.embedding_model.embed_text(question)

        # 2. 搜索相关文档
        results = self.vector_db.search("multimodal", query_embedding)

        # 3. 构建上下文
        context = "\n\n".join([
            f"【{r['type']}】{r['content']}"
            for r in results
        ])

        # 4. 生成回答
        prompt = f"""基于以下信息回答问题:

{context}

问题:{question}

回答:"""
        return self.llm.generate(prompt)
```

## 实战示例

```python
# 初始化
embedding_model = MultiModalEmbedding()
vector_db = MultimodalVectorDB()
vector_db.create_collection("multimodal", vector_size=512)

# 添加文档
rag = MultimodalRAG(llm, vector_db, embedding_model)
rag.add_document("report.pdf")
rag.add_document("diagram.png")
rag.add_document("notes.txt")

# 查询
result = rag.query("报告中提到的关键数据是什么?")
print(result)
```

## 总结

多模态 RAG 系统扩展了传统 RAG 的能力:

1. **图文 Embedding**:使用 CLIP 模型处理图片和文本
2. **多模态数据库**:统一存储和检索不同类型的内容
3. **智能查询**:能够根据文本问题找到相关的图片和文档

关键要点:

- 使用 CLIP 进行跨模态检索,统一的向量表示便于混合检索
- 需要考虑不同模态的特性,上下文构建需要适当处理不同类型的内容