# Copyright (c) 2024 Harim Kang
# SPDX-License-Identifier: MIT
"""
DocSense: An intelligent document assistant powered by Qwen.
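
Example:
    Illustrative sketch (assumes the package is installed and a local
    ``./docs`` directory with readable documents exists)::

        from docsense import create_index, ask_question

        create_index("./docs")
        result = ask_question("What does DocSense do?")
        print(result["answer"])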
"""
__version__ = "0.1.0"

import os
from pathlib import Path
from typing import Any, Dict, Optional

from .indexer import DocumentLoader, VectorStore
from .models import EmbeddingModel, LLMModel

DEFAULT_INDEX_PATH = Path(os.path.expanduser("~/.docsense/index"))


class DocSense:
    """Main class for document processing and question answering."""

    _instance = None

    @classmethod
    def get_instance(cls, **kwargs):
        """Return the shared instance, creating it with ``kwargs`` on first use."""
        if cls._instance is None:
            cls._instance = cls(**kwargs)
        return cls._instance

    def __init__(
self,
model_name: str = "Qwen/Qwen2-7B",
embedding_model: Optional[str] = None,
device: str = "cuda",
index_path: Optional[str] = None,
use_gpu_faiss: bool = True,
):
"""
Initialize DocSense instance.
Args:
model_name: Name of the Qwen model to use
embedding_model: Name of the embedding model (defaults to model_name if None)
device: Device to run the model on ('cuda' or 'cpu')
index_path: Path to store/load the vector index (defaults to ~/.docsense/index)
use_gpu_faiss: Whether to use GPU for FAISS operations
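
        Example:
            Minimal sketch (illustrative; model names and device depend on
            your environment; a CPU-only setup avoids GPU requirements)::

                ds = DocSense(
                    model_name="Qwen/Qwen2-7B",
                    device="cpu",
                    use_gpu_faiss=False,
                )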
"""
self.model_name = model_name
self.embedding_model = embedding_model or model_name
self.device = device
        self.index_path = Path(index_path) if index_path else DEFAULT_INDEX_PATH
        # Ensure the parent directory of the index exists
        self.index_path.parent.mkdir(parents=True, exist_ok=True)
# Initialize models
self._init_models()
# Initialize document processing
self.doc_loader = DocumentLoader()
# Initialize vector store
self._vector_store = VectorStore(
dimension=self._embedding_model.get_embedding_dim(),
index_path=str(self.index_path),
use_gpu=use_gpu_faiss,
)
        self._response_cache: Dict[str, Any] = {}  # Simple cache for responses

    def _init_models(self):
"""Initialize the models."""
# Initialize embedding model
self._embedding_model = EmbeddingModel(model_name=self.embedding_model, device=self.device)
# Initialize LLM
self._llm = LLMModel(model_name=self.model_name, device=self.device)

    def index_documents(self, doc_path: str) -> None:
"""
Index documents from the specified path.
Args:
doc_path: Path to the documents directory
Raises:
ValueError: If no documents are found in the specified path
Exception: If there are errors during embedding generation or vector store operations
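
        Example:
            Illustrative sketch (assumes ``./docs`` exists and contains
            documents the loader can read)::

                ds = DocSense(device="cpu", use_gpu_faiss=False)
                ds.index_documents("./docs")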
"""
try:
print(f"\nStarting document indexing from: {doc_path}")
# Load documents as iterator
document_iterator = self.doc_loader.load_directory(doc_path)
# Convert to list only when needed for length check and processing
documents = list(document_iterator)
print(f"\nFound {len(documents)} documents")
if not documents:
print(f"No documents were found in {doc_path}")
raise ValueError(f"No documents found in {doc_path}")
print(f"\nProcessing {len(documents)} documents...")
            # Collect document texts for embedding generation
            texts_list = [doc.content for doc in documents]
            print(f"Generating embeddings for {len(texts_list)} texts...")
try:
embeddings = self._embedding_model.encode(texts_list)
print(f"Generated embeddings shape: {embeddings.shape}")
except Exception as e:
print(f"Error generating embeddings: {str(e)}")
raise
# Add to vector store
try:
self._vector_store.add_documents(documents, embeddings)
print("Successfully added documents to vector store")
except Exception as e:
print(f"Error adding to vector store: {str(e)}")
raise
except Exception as e:
print(f"\nError during document indexing: {str(e)}")
print(f"Type: {type(e).__name__}")
raise

    def ask(self, question: str) -> Dict[str, Any]:
"""
Answer a question based on the indexed documents.
Args:
question: User question
Returns:
Dict containing:
- answer: Generated response to the question
- sources: List of relevant source documents with metadata
- metadata: Additional information about prompt and generation config
Raises:
RuntimeError: If no documents have been indexed yet
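
        Example:
            Illustrative sketch (given a ``DocSense`` instance ``ds`` whose
            documents have already been indexed)::

                result = ds.ask("How do I configure the index path?")
                print(result["answer"])
                for source in result["sources"]:
                    print(source["path"], source["relevance_score"])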
"""
# Normalize question for cache key
cache_key = question.strip().lower()
# Check cache
if cache_key in self._response_cache:
return self._response_cache[cache_key]
# Generate new response
if not self._vector_store:
raise RuntimeError("No documents have been indexed yet")
# Generate question embedding
question_embedding = self._embedding_model.encode(question)
# Search for relevant documents
relevant_docs = self._vector_store.search(question_embedding)
if not relevant_docs:
response = {
"answer": "I couldn't find any relevant information to answer your question.",
"sources": [],
"metadata": {},
}
else:
# Prepare context and sources
context = []
sources = []
for doc, score in relevant_docs:
context.append(doc.content)
sources.append(
{
"path": doc.metadata.get("source", "Unknown"),
"type": doc.metadata.get("type", "Unknown"),
"relevance_score": float(score),
}
)
# Generate answer
llm_response = self._llm.generate(question, context=context)
response = {
"answer": llm_response["answer"],
"sources": sources,
"metadata": {"prompt": llm_response["prompt"], "generation_config": llm_response["generation_config"]},
}
# Cache response
self._response_cache[cache_key] = response
return response


# Convenience functions
def get_docsense(**kwargs) -> DocSense:
    """Return the shared DocSense instance (created on the first call)."""
    return DocSense.get_instance(**kwargs)


def create_index(doc_path: str, **kwargs) -> None:
    """Index the documents under ``doc_path`` using the shared DocSense instance."""
    ds = get_docsense(**kwargs)
    ds.index_documents(doc_path)


def ask_question(question: str, **kwargs) -> Dict[str, Any]:
    """Answer ``question`` against the indexed documents using the shared DocSense instance."""
    ds = get_docsense(**kwargs)
    return ds.ask(question)
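
# Note: the convenience helpers above share a single DocSense instance via
# DocSense.get_instance, so constructor kwargs only take effect on the first
# call. Illustrative sketch:
#
#   create_index("./docs", device="cpu", use_gpu_faiss=False)  # creates the instance
#   ask_question("What is DocSense?")                          # reuses the same instance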