diff --git a/run.py b/run.py
index 9e0807d..04d65fb 100644
--- a/run.py
+++ b/run.py
@@ -4,4 +4,4 @@
 app = create_app()
 
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    uvicorn.run(app, host="0.0.0.0", port=8001)
diff --git a/src/api/controllers/__init__.py b/src/api/controllers/__init__.py
index 3393a26..2f18c19 100644
--- a/src/api/controllers/__init__.py
+++ b/src/api/controllers/__init__.py
@@ -1,4 +1,5 @@
 from .chat import new_message as chat_new_message
 from .guardrails import Guardrail
+from .controller_document import add_documents
 
-__all__ = ["chat_new_message", "Guardrail"]
+__all__ = ["chat_new_message", "add_documents", "Guardrail"]
diff --git a/src/api/controllers/controller_document.py b/src/api/controllers/controller_document.py
new file mode 100644
index 0000000..baa7742
--- /dev/null
+++ b/src/api/controllers/controller_document.py
@@ -0,0 +1,61 @@
+from src.infrastructure.database.chromadb.conector import ChromaDB
+from src.services.docmuent_extration.extractor import DocumentFileReader
+from typing import List, Dict, Optional
+
+
+async def add_documents(
+    db: ChromaDB,
+    file_path: str,
+    collection_name: str,
+    metadata: Optional[str] = None,
+    document_id: Optional[str] = None
+) -> Dict:
+    # Extract the file into documents, IDs and metadata, then persist them
+    reader = DocumentFileReader(file_path, metadata, document_id)
+    result = await db.add_documents(
+        documents=reader.documents,
+        collection_name=collection_name,
+        metadatas=reader.metadata,
+        ids=reader.ids
+    )
+    return result
+
+async def query_documents(
+    db: ChromaDB,
+    query_text: str,
+    collection_name: str,
+    n_results: int = 5,
+    where: Optional[dict] = None
+):
+    result = await db.query_documents(
+        query_text=query_text,
+        collection_name=collection_name,
+        n_results=n_results,
+        where=where
+    )
+    return result
+
+async def delete_documents(
+    db: ChromaDB,
+    ids: List[str],
+    collection_name: str
+):
+    result = await db.delete_documents(
+        ids=ids,
+        collection_name=collection_name
+    )
+    return result
+
+async def list_collections(
+    db: ChromaDB
+):
+    result = await db.list_collections()
+    return result
+
+async def list_documents(
+    db: ChromaDB,
+    collection_name: str,
+):
+    result = await db.list_documents(
+        collection_name=collection_name
+    )
+    return result
\ No newline at end of file
diff --git a/src/api/models/__init__.py b/src/api/models/__init__.py
index f19d09f..c8a1b37 100644
--- a/src/api/models/__init__.py
+++ b/src/api/models/__init__.py
@@ -1,3 +1,4 @@
 from .api import APIResponse, APIRequest
+from .documents_request import AddDocumentRequest, QueryDocumentRequest, DeleteDocumentRequest
 
-__all__ = ["APIResponse", "APIRequest"]
+__all__ = ["APIResponse", "APIRequest", "AddDocumentRequest", "QueryDocumentRequest", "DeleteDocumentRequest"]
diff --git a/src/api/models/documents_request.py b/src/api/models/documents_request.py
new file mode 100644
index 0000000..c3af766
--- /dev/null
+++ b/src/api/models/documents_request.py
@@ -0,0 +1,22 @@
+from pydantic import BaseModel
+from typing import Optional, List, Dict
+
+class AddDocumentRequest(BaseModel):
+    file_path: str
+    collection_name: str
+
+class QueryDocumentRequest(BaseModel):
+    query_text: str
+    collection_name: str
+    n_results: Optional[int] = 5
+    where: Optional[Dict] = None
+
+class DeleteDocumentRequest(BaseModel):
+    ids: List[str]
+    collection_name: str
+
+class ListDocumentsRequest(BaseModel):
+    collection_name: str
+
+class ListCollectionsRequest(BaseModel):
+    pass
\ No newline at end of file
diff --git a/src/api/routes/__init__.py b/src/api/routes/__init__.py
new file mode 100644
index 0000000..b1efb50
--- /dev/null
+++ b/src/api/routes/__init__.py
@@ -0,0 +1,6 @@
+from src.api.routes.route_document import router as document_router
+from src.api.routes.chat import router as chat_router
+
+
+__all__ = ["chat_router", "document_router"]
+
diff --git a/src/api/routes/route_document.py b/src/api/routes/route_document.py
new file mode 100644
index 0000000..32321dd
--- /dev/null
+++ b/src/api/routes/route_document.py
@@ -0,0 +1,99 @@
+from fastapi import APIRouter, status, Request
+from src.api.models import APIResponse
+from src.api.models.documents_request import AddDocumentRequest, QueryDocumentRequest, ListDocumentsRequest, DeleteDocumentRequest
+from src.api.controllers.controller_document import add_documents, query_documents, delete_documents, list_collections, list_documents
+
+router = APIRouter(
+    prefix="/documents",
+    tags=["documents"]
+)
+
+@router.post("/add_documents", status_code=status.HTTP_200_OK)
+async def add_document_route(doc_request: AddDocumentRequest, req: Request) -> APIResponse:
+    try:
+        result = await add_documents(
+            req.app.vectordb,
+            doc_request.file_path,
+            doc_request.collection_name
+        )
+
+        return APIResponse(
+            status_code=200,
+            response=result
+        )
+
+    except Exception as e:
+        return APIResponse(
+            status_code=500,
+            status_message=f"Detailed error: {str(e)}"
+        )
+
+@router.post("/query_documents", status_code=status.HTTP_200_OK)
+async def query_document_route(doc_request: QueryDocumentRequest, req: Request) -> APIResponse:
+    try:
+        result = await query_documents(
+            req.app.vectordb,
+            doc_request.query_text,
+            doc_request.collection_name,
+            doc_request.n_results,
+            doc_request.where
+        )
+        return APIResponse(
+            status_code=200,
+            response=result
+        )
+    except Exception as e:
+        return APIResponse(
+            status_code=500,
+            status_message=f"Detailed error: {str(e)}"
+        )
+
+@router.get("/list_collections", status_code=status.HTTP_200_OK)
+async def list_collections_route(req: Request):
+    try:
+        result = await list_collections(req.app.vectordb)
+        return {
+            "status_code": 200,
+            "response": result  # returns the list of collections directly
+        }
+    except Exception as e:
+        return {
+            "status_code": 500,
+            "status_message": f"Detailed error: {str(e)}"
+        }
+
+@router.post("/list_documents", status_code=status.HTTP_200_OK)
+async def list_documents_route(doc_request: ListDocumentsRequest, req: Request) -> APIResponse:
+    try:
+        result = await list_documents(
+            req.app.vectordb,
+            doc_request.collection_name
+        )
+        return APIResponse(
+            status_code=200,
+            response=result
+        )
+    except Exception as e:
+        return APIResponse(
+            status_code=500,
+            status_message=f"Detailed error: {str(e)}"
+        )
+
+@router.delete("/delete_documents", status_code=status.HTTP_200_OK)
+async def delete_document_route(doc_request: DeleteDocumentRequest, req: Request) -> APIResponse:
+    try:
+        result = await delete_documents(
+            req.app.vectordb,
+            doc_request.ids,
+            doc_request.collection_name
+        )
+        return APIResponse(
+            status_code=200,
+            response=result
+        )
+    except Exception as e:
+        return APIResponse(
+            status_code=500,
+            status_message=f"Detailed error: {str(e)}"
+        )
\ No newline at end of file
diff --git a/src/infrastructure/database/chromadb/conector.py b/src/infrastructure/database/chromadb/conector.py
new file mode 100644
index 0000000..adc20a0
--- /dev/null
+++ b/src/infrastructure/database/chromadb/conector.py
@@ -0,0 +1,125 @@
+import chromadb
+from typing import List, Optional
+
+
+class ChromaDB:
+    """Manager for ChromaDB connection and operations."""
+
+    def __init__(self):
+        """Initialize ChromaDB connection."""
+        # ChromaDB server defaults; the API itself now listens on 8001 (see run.py)
+        self.host = 'localhost'
+        self.port = 8000
+        self.client = self._connect()
+        self.collection = None
+
+    def _connect(self):
+        """Connect to ChromaDB."""
+        client = chromadb.HttpClient(host=self.host, port=self.port)
+        return client
+
+    def _get_or_create_collection(self, collection_name: str):
+        """Create or get an existing collection.
+
+        Args:
+            collection_name (str): Name of the collection to create/get
+
+        Returns:
+            Collection: Created/retrieved collection object
+        """
+        self.collection = self.client.get_or_create_collection(
+            name=collection_name
+        )
+        return self.collection
+
+    async def add_documents(
+        self,
+        documents: List[str],
+        collection_name: str,
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None
+    ):
+        """Add documents to a collection.
+
+        Args:
+            documents (List[str]): List of document texts
+            collection_name (str): Name of the target collection
+            metadatas (Optional[List[dict]]): Document metadata
+            ids (Optional[List[str]]): Unique document IDs
+
+        Returns:
+            dict: Result of add operation
+        """
+        self.collection = self._get_or_create_collection(collection_name)
+        return self.collection.add(
+            documents=documents,
+            metadatas=metadatas,
+            ids=ids
+        )
+
+    async def query_documents(
+        self,
+        query_text: str,
+        collection_name: str,
+        n_results: int = 5,
+        where: Optional[dict] = None
+    ):
+        """Query similar documents in a collection.
+
+        Args:
+            query_text (str): Text to search for similarity
+            collection_name (str): Name of the collection to query
+            n_results (int): Number of desired results
+            where (Optional[dict]): Additional filters
+
+        Returns:
+            dict: Query results
+        """
+        self.collection = self._get_or_create_collection(collection_name)
+        return self.collection.query(
+            query_texts=[query_text],
+            n_results=n_results,
+            where=where
+        )
+
+    async def list_collections(self):
+        """List all collections in ChromaDB.
+
+        Returns:
+            list: Collections available in the database
+        """
+        return self.client.list_collections()
+
+    async def list_documents(self, collection_name: str):
+        """List all documents in a collection.
+
+        Args:
+            collection_name (str): Name of the collection to list documents from
+
+        Returns:
+            dict: Documents and metadata stored in the collection
+        """
+        self.collection = self._get_or_create_collection(collection_name)
+        return self.collection.get(include=["documents", "metadatas"])
+
+    async def delete_documents(self, ids: List[str], collection_name: str):
+        """Delete documents from a collection by ID.
+
+        Args:
+            ids (List[str]): List of document IDs to delete
+            collection_name (str): Name of the collection to delete from
+
+        Returns:
+            dict: Result of delete operation
+        """
+        self.collection = self._get_or_create_collection(collection_name)
+        return self.collection.delete(ids=ids)
+
+    def __del__(self):
+        """Release the ChromaDB client reference.
+
+        Note: client.reset() would wipe every collection on the server,
+        so it is intentionally not called here.
+        """
+        self.client = None
+        self.collection = None
\ No newline at end of file
diff --git a/src/main.py b/src/main.py
index 1cd2aaa..764888d 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,21 +1,23 @@
 from fastapi import FastAPI
-
-from src.infrastructure.database import MongoDB
+from src.infrastructure.database.chromadb.conector import ChromaDB
+from src.infrastructure.database.mongodb.connector import MongoDB
 from src.infrastructure.config.llm import LLM
 from src.services.llama_guard import LlamaGuard
-from src.api.routes import chat_router
+from src.api.routes import chat_router, document_router
 
 
 def create_app():
     app = FastAPI()
 
     # defining API variables
+    app.vectordb = ChromaDB()
     app.database = MongoDB()
     app.llm = LLM()
     app.llama_guard = LlamaGuard()
 
     # including routes
     app.include_router(chat_router)
+    app.include_router(document_router)
 
     return app
diff --git a/src/services/docmuent_extration/extractor.py b/src/services/docmuent_extration/extractor.py
new file mode 100644
index 0000000..77ba17f
--- /dev/null
+++ b/src/services/docmuent_extration/extractor.py
@@ -0,0 +1,201 @@
+import json
+from typing import List, Dict, Optional
+from pathlib import Path
+import uuid
+from docx import Document
+from PyPDF2 import PdfReader
+from dataclasses import dataclass
+
+@dataclass
+class DocumentResult:
+    """
+    Container for the result of reading a document file.
+
+    Attributes:
+        documents (List[str]): List of processed documents
+        ids (List[str]): List of unique identifiers
+        metadata (List[Dict]): List of associated metadata
+    """
+    documents: List[str]
+    ids: List[str]
+    metadata: List[Dict]
+
+class DocumentFileReader:
+    """
+    Reader and processor for different file types (JSON, PDF, DOCX, TXT).
+
+    This class provides a unified interface for reading different file formats,
+    splitting them into individual documents and generating unique identifiers
+    and metadata for each document.
+
+    Attributes:
+        supported_extensions (set): Supported file extensions (.json, .pdf, .docx, .txt)
+        documents (list): List of processed documents
+        ids (list): List of unique identifiers for each document
+        metadata (list): List of metadata associated with each document
+
+    Example:
+        >>> reader = DocumentFileReader("documento.pdf", "noticia")
+        >>> print(len(reader.documents))  # number of documents
+        >>> print(reader.ids)             # list of generated IDs
+        >>> print(reader.metadata)        # document metadata
+
+    Note:
+        - PDF and DOCX files are treated as a single document
+        - JSON files may contain multiple documents
+        - TXT files are treated as a single document
+    """
+
+    def __init__(self, file_path: str, metadata: str, document_id: Optional[str] = None):
+        """
+        Initialize the document reader.
+
+        Args:
+            file_path (str): Path of the file to read
+            metadata (str): Content type of the document (e.g. "noticia", "artigo")
+            document_id (Optional[str]): Fixed ID to use instead of a generated UUID
+
+        Raises:
+            FileNotFoundError: If the file is not found
+            ValueError: If the file extension is not supported
+        """
+        self.supported_extensions = {'.json', '.pdf', '.docx', '.txt'}
+        result = self.__call__(file_path, metadata, document_id)
+        self.documents = result.documents
+        self.ids = result.ids
+        self.metadata = result.metadata
+
+    def __call__(self, file_path: str, metadata: Optional[str] = None, document_id: Optional[str] = None) -> DocumentResult:
+        """
+        Process the file and return structured results.
+
+        Args:
+            file_path (str): Path of the file to read
+            metadata (str): Content type of the document
+            document_id (Optional[str]): Fixed ID to use instead of a generated UUID
+
+        Returns:
+            DocumentResult: Object containing documents, IDs and metadata
+
+        Raises:
+            FileNotFoundError: If the file is not found
+            ValueError: If the file extension is not supported
+        """
+        path = Path(file_path)
+
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+
+        if path.suffix.lower() not in self.supported_extensions:
+            raise ValueError(f"Unsupported extension: {path.suffix}")
+
+        readers = {
+            '.json': self._read_json,
+            '.pdf': self._read_pdf,
+            '.docx': self._read_docx,
+            '.txt': self._read_txt
+        }
+
+        reader = readers.get(path.suffix.lower())
+        documents = reader(file_path)
+
+        # Generate one unique ID per document unless a fixed ID was supplied
+        if document_id is None:
+            ids = [str(uuid.uuid4()) for _ in documents]
+        else:
+            ids = [document_id]
+
+        # Wrap the metadata string into one dict per document so it matches the
+        # List[Dict] shape expected by DocumentResult and by ChromaDB's
+        # "metadatas" argument (the "type" key is an arbitrary choice)
+        metadatas = [{"type": metadata} for _ in documents] if metadata else None
+
+        return DocumentResult(
+            documents=documents,
+            ids=ids,
+            metadata=metadatas
+        )
+
+    def _read_json(self, file_path: str) -> List[str]:
+        """
+        Read and process JSON files.
+
+        Args:
+            file_path (str): Path of the JSON file
+
+        Returns:
+            List[str]: List of documents extracted from the JSON
+
+        Notes:
+            - Each item is expected to contain a 'texto' key
+            - Handles both a single JSON object and a list of objects
+        """
+        with open(file_path, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+
+        if isinstance(data, list):
+            documents = [item.get('texto', '') for item in data]
+        else:
+            documents = [data.get('texto', '')]
+        return documents
+
+    def _read_pdf(self, file_path: str) -> List[str]:
+        """
+        Read and process PDF files.
+
+        Args:
+            file_path (str): Path of the PDF file
+
+        Returns:
+            List[str]: List containing the full text of the PDF
+
+        Notes:
+            - The whole PDF content is combined into a single document
+            - Empty pages are ignored
+        """
+        reader = PdfReader(file_path)
+
+        # Combine all of the PDF text into a single document
+        full_text = " ".join(page.extract_text() or "" for page in reader.pages).strip()
+
+        # Create a list with a single document
+        documents = [full_text]
+
+        return documents
+
+    def _read_docx(self, file_path: str) -> List[str]:
+        """
+        Read and process DOCX files.
+
+        Args:
+            file_path (str): Path of the DOCX file
+
+        Returns:
+            List[str]: List containing the full text of the document
+
+        Notes:
+            - Empty paragraphs are ignored
+            - The whole content is combined into a single document
+        """
+        doc = Document(file_path)
+        # Combine all paragraphs into a single text, separated by spaces
+        full_text = " ".join(paragraph.text.strip() for paragraph in doc.paragraphs if paragraph.text.strip())
+        # Return a list with a single document
+        documents = [full_text]
+
+        return documents
+
+    def _read_txt(self, file_path: str) -> List[str]:
+        """
+        Read and process plain text files.
+
+        Args:
+            file_path (str): Path of the TXT file
+
+        Returns:
+            List[str]: List containing the full text of the file
+
+        Notes:
+            - The whole file is treated as a single document
+        """
+        with open(file_path, 'r', encoding='utf-8') as file:
+            text = file.read()
+        documents = [text]
+
+        return documents
+
\ No newline at end of file
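
Not part of the diff: a minimal usage sketch of the new /documents endpoints, assuming the app from this change is running on port 8001 (per run.py) and a ChromaDB server is reachable on localhost:8000. The file path and collection name are illustrative placeholders, and requests is just one possible HTTP client.

    import requests

    BASE = "http://localhost:8001/documents"

    # Ingest a local file into an illustrative collection
    add = requests.post(f"{BASE}/add_documents", json={
        "file_path": "data/example.txt",      # hypothetical path
        "collection_name": "example_docs",    # hypothetical collection
    })
    print(add.json())

    # Run a similarity query against the same collection
    query = requests.post(f"{BASE}/query_documents", json={
        "query_text": "example question",
        "collection_name": "example_docs",
        "n_results": 3,
    })
    print(query.json())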