Source code for docsense

# Copyright (c) 2024 Harim Kang
# SPDX-License-Identifier: MIT

"""
DocSense: An intelligent document assistant powered by Qwen.
"""

__version__ = "0.1.0"

import os
from pathlib import Path
from typing import Any, Dict, Optional

from .indexer import DocumentLoader, VectorStore
from .models import EmbeddingModel, LLMModel

DEFAULT_INDEX_PATH = Path(os.path.expanduser("~/.docsense/index"))


class DocSense:
    """Main class for document processing and question answering.

    A usage sketch follows the class definition below.
    """

    # Shared singleton instance, managed by get_instance().
    _instance = None

    @classmethod
    def get_instance(cls, **kwargs):
        """Return the shared DocSense instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls(**kwargs)
        return cls._instance

    def __init__(
        self,
        model_name: str = "Qwen/Qwen2-7B",
        embedding_model: Optional[str] = None,
        device: str = "cuda",
        index_path: Optional[str] = None,
        use_gpu_faiss: bool = True,
    ):
        """
        Initialize a DocSense instance.

        Args:
            model_name: Name of the Qwen model to use
            embedding_model: Name of the embedding model (defaults to model_name if None)
            device: Device to run the models on ('cuda' or 'cpu')
            index_path: Path to store/load the vector index (defaults to ~/.docsense/index)
            use_gpu_faiss: Whether to use GPU for FAISS operations
        """
        self.model_name = model_name
        self.embedding_model = embedding_model or model_name
        self.device = device
        self.index_path = Path(index_path) if index_path else DEFAULT_INDEX_PATH

        # Create the index directory if it doesn't exist
        self.index_path.parent.mkdir(parents=True, exist_ok=True)

        # Initialize embedding and language models
        self._init_models()

        # Initialize document processing
        self.doc_loader = DocumentLoader()

        # Initialize the vector store
        self._vector_store = VectorStore(
            dimension=self._embedding_model.get_embedding_dim(),
            index_path=str(self.index_path),
            use_gpu=use_gpu_faiss,
        )

        # Simple in-memory cache for responses
        self._response_cache: Dict[str, Any] = {}

    def _init_models(self):
        """Initialize the embedding model and the LLM."""
        # Initialize embedding model
        self._embedding_model = EmbeddingModel(model_name=self.embedding_model, device=self.device)

        # Initialize LLM
        self._llm = LLMModel(model_name=self.model_name, device=self.device)

    def index_documents(self, doc_path: str) -> None:
        """
        Index documents from the specified path.

        Args:
            doc_path: Path to the documents directory

        Raises:
            ValueError: If no documents are found in the specified path
            Exception: If there are errors during embedding generation or vector store operations
        """
        try:
            print(f"\nStarting document indexing from: {doc_path}")

            # Load documents lazily, then materialize them for the length check and processing
            document_iterator = self.doc_loader.load_directory(doc_path)
            documents = list(document_iterator)
            print(f"\nFound {len(documents)} documents")

            if not documents:
                print(f"No documents were found in {doc_path}")
                raise ValueError(f"No documents found in {doc_path}")

            print(f"\nProcessing {len(documents)} documents...")

            # Collect the document texts for embedding
            texts_list = [doc.content for doc in documents]
            print(f"Generating embeddings for {len(texts_list)} texts...")

            try:
                embeddings = self._embedding_model.encode(texts_list)
                print(f"Generated embeddings shape: {embeddings.shape}")
            except Exception as e:
                print(f"Error generating embeddings: {str(e)}")
                raise

            # Add documents and their embeddings to the vector store
            try:
                self._vector_store.add_documents(documents, embeddings)
                print("Successfully added documents to vector store")
            except Exception as e:
                print(f"Error adding to vector store: {str(e)}")
                raise

        except Exception as e:
            print(f"\nError during document indexing: {str(e)}")
            print(f"Type: {type(e).__name__}")
            raise

    def ask(self, question: str) -> Dict[str, Any]:
        """
        Answer a question based on the indexed documents.

        Args:
            question: User question

        Returns:
            Dict containing:
                - answer: Generated response to the question
                - sources: List of relevant source documents with metadata
                - metadata: Additional information about the prompt and generation config

        Raises:
            RuntimeError: If no documents have been indexed yet
        """
        # Normalize the question to use as a cache key
        cache_key = question.strip().lower()

        # Return a cached response if available
        if cache_key in self._response_cache:
            return self._response_cache[cache_key]

        # Generate a new response
        if not self._vector_store:
            raise RuntimeError("No documents have been indexed yet")

        # Embed the question and search for relevant documents
        question_embedding = self._embedding_model.encode(question)
        relevant_docs = self._vector_store.search(question_embedding)

        if not relevant_docs:
            response = {
                "answer": "I couldn't find any relevant information to answer your question.",
                "sources": [],
                "metadata": {},
            }
        else:
            # Prepare context passages and source metadata
            context = []
            sources = []
            for doc, score in relevant_docs:
                context.append(doc.content)
                sources.append(
                    {
                        "path": doc.metadata.get("source", "Unknown"),
                        "type": doc.metadata.get("type", "Unknown"),
                        "relevance_score": float(score),
                    }
                )

            # Generate the answer with the retrieved context
            llm_response = self._llm.generate(question, context=context)
            response = {
                "answer": llm_response["answer"],
                "sources": sources,
                "metadata": {
                    "prompt": llm_response["prompt"],
                    "generation_config": llm_response["generation_config"],
                },
            }

        # Cache and return the response
        self._response_cache[cache_key] = response
        return response
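

# A minimal usage sketch for the DocSense class above (illustrative only; the
# "./docs" directory and the question are hypothetical, not part of the API):
#
#     ds = DocSense.get_instance(model_name="Qwen/Qwen2-7B", device="cuda")
#     ds.index_documents("./docs")
#     result = ds.ask("How do I change the index path?")
#     print(result["answer"])
#     for source in result["sources"]:
#         print(f"- {source['path']} (score: {source['relevance_score']:.2f})")
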
# Convenience functions
def get_docsense(**kwargs) -> DocSense:
    """Return the shared DocSense instance, creating it if necessary."""
    return DocSense.get_instance(**kwargs)


def create_index(doc_path: str, **kwargs) -> None:
    """Index documents from doc_path using the shared DocSense instance."""
    ds = get_docsense(**kwargs)
    ds.index_documents(doc_path)


def ask_question(question: str, **kwargs) -> Dict[str, Any]:
    """Answer a question against the indexed documents using the shared instance."""
    ds = get_docsense(**kwargs)
    return ds.ask(question)
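

if __name__ == "__main__":
    # Illustrative sketch only, exercising the convenience functions defined above.
    # Assumptions (not part of the library): a local "./docs" directory with
    # readable documents, a CUDA-capable GPU, and a sample question.
    create_index("./docs", device="cuda")
    result = ask_question("What is this project about?")

    # The response dict exposes the generated answer and the source documents
    # that were retrieved for context.
    print(result["answer"])
    for source in result["sources"]:
        print(f"- {source['path']} (score: {source['relevance_score']:.2f})")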