From e795e6d457ed44011da99ccc187e9871a573dbaa Mon Sep 17 00:00:00 2001
From: Pedro HB Ribeiro
Date: Wed, 12 Feb 2025 18:54:53 -0300
Subject: [PATCH 01/15] [feat] add chroma connector

---
 .../database/chromadb/conector.py | 110 ++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 src/infrastructure/database/chromadb/conector.py

diff --git a/src/infrastructure/database/chromadb/conector.py b/src/infrastructure/database/chromadb/conector.py
new file mode 100644
index 0000000..155ff43
--- /dev/null
+++ b/src/infrastructure/database/chromadb/conector.py
@@ -0,0 +1,110 @@
+import chromadb
+from chromadb.config import Settings
+from typing import List, Optional
+
+
+class ChromaDB:
+    """Manager for ChromaDB connection and operations."""
+
+    def __init__(self):
+        """Initialize ChromaDB connection."""
+        self.host = 'localhost'
+        self.port = 8000
+        self.client = self._connect()
+        self.collection = None
+
+    def _connect(self):
+        """Connect to ChromaDB."""
+        client = chromadb.HttpClient(host=self.host, port=self.port)
+        return client
+
+    def _create_collection(self, collection_name: str):
+        """Create or get an existing collection.
+
+        Args:
+            collection_name (str): Name of the collection to create/get
+
+        Returns:
+            Collection: Created/retrieved collection object
+        """
+        self.collection = self.client.get_or_create_collection(
+            name=collection_name
+        )
+        return self.collection
+
+    async def add_documents(
+        self,
+        documents: List[str],
+        collection_name: str,
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None
+    ):
+        """Add documents to the collection.
+
+        Args:
+            documents (List[str]): List of document texts
+            metadatas (Optional[List[dict]]): Document metadata
+            ids (Optional[List[str]]): Unique document IDs
+
+        Notes:
+            The collection is created or fetched automatically if not yet initialized.
+
+        Returns:
+            dict: Result of add operation
+        """
+        if not self.collection:
+            self.collection = self._create_collection(collection_name)
+        return self.collection.add(
+            documents=documents,
+            metadatas=metadatas,
+            ids=ids
+        )
+
+    async def query_documents(
+        self,
+        query_text: str,
+        collection_name: str,
+        n_results: int = 5,
+        where: Optional[dict] = None
+    ):
+        """Query similar documents in the collection.
+
+        Args:
+            query_text (str): Text to search for similarity
+            n_results (int): Number of desired results
+            where (Optional[dict]): Additional filters
+
+        Notes:
+            The collection is created or fetched automatically if not yet initialized.
+
+        Returns:
+            dict: Query results
+        """
+        if not self.collection:
+            self.collection = self._create_collection(collection_name)
+        return self.collection.query(
+            query_texts=[query_text],
+            n_results=n_results,
+            where=where
+        )
+
+    async def delete_documents(self, ids: List[str],collection_name: str):
+        """Delete documents from collection by IDs.
+
+        Args:
+            ids (List[str]): List of document IDs to delete
+
+        Notes:
+            The collection is created or fetched automatically if not yet initialized.
+
+        Returns:
+            dict: Result of delete operation
+        """
+        if not self.collection:
+            self.collection = self._create_collection(collection_name)
+        return self.collection.delete(ids=ids)
+
+    async def close(self):
+        """Close ChromaDB connection and clean up resources."""
+        if self.client:
+            self.client.reset()
\ No newline at end of file

From 1de190d17cb06d4f493af60260fc540d3c2c8cd5 Mon Sep 17 00:00:00 2001
From: Pedro HB Ribeiro
Date: Thu, 13 Feb 2025 18:00:52 -0300
Subject: [PATCH 02/15] [feat] file reader

---
 src/services/docmuent_extration/extractor.py | 98 ++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 src/services/docmuent_extration/extractor.py

diff --git a/src/services/docmuent_extration/extractor.py b/src/services/docmuent_extration/extractor.py
new file mode 100644
index 0000000..3c81712
--- /dev/null
+++ b/src/services/docmuent_extration/extractor.py
@@ -0,0 +1,98 @@
+import json
+from typing import List, Dict, Tuple, Optional
+from pathlib import Path
+from docx import Document
+from PyPDF2 import PdfReader
+import re
+
+class Reader:
+    """
+    Classe genérica para leitura de diferentes tipos de arquivos (JSON, PDF, DOCX, TXT).
+    """
+
+    def __init__(self):
+        """Inicializa o leitor de documentos."""
+        self.supported_extensions = {'.json', '.pdf', '.docx', '.txt'}
+
+    def __call__(self, file_path: str, document_content: str) -> Tuple[List[str], List[str], List[Dict]]:
+        """
+        Lê um arquivo e retorna seus documentos, IDs e metadados.
+
+        Args:
+            file_path (str): Caminho para o arquivo
+            document_content (str): Tipo do documento (ex: 'noticia', 'artigo', 'contrato')
+
+        Returns:
+            Tuple[List[str], List[str], List[Dict]]: (documentos, ids, metadados)
+        """
+        path = Path(file_path)
+
+        if not path.exists():
+            raise FileNotFoundError(f"Arquivo não encontrado: {file_path}")
+
+        if path.suffix.lower() not in self.supported_extensions:
+            raise ValueError(f"Extensão não suportada: {path.suffix}")
+
+        # Seleciona o método apropriado baseado na extensão
+        readers = {
+            '.json': self._read_json,
+            '.pdf': self._read_pdf,
+            '.docx': self._read_docx,
+            '.txt': self._read_txt
+        }
+
+        reader = readers.get(path.suffix.lower())
+        documents, content = reader(file_path)
+
+        # Adiciona os documentos ao content para uso no _generate_metadata
+        content['documents'] = documents
+
+        # Gera IDs e metadados
+        ids = [f"{document_content}_{i+1}" for i in range(len(documents))]
+        metadata = self._generate_metadata(content, document_content, path.name)
+
+        return documents, ids, metadata
+
+    def _read_json(self, file_path: str) -> Tuple[List[str], Dict]:
+        """Lê arquivo JSON."""
+        with open(file_path, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+
+        if isinstance(data, list):
+            documents = [item.get('texto', '') for item in data]
+            content = {
+                'titulos': [item.get('titulo', '') for item in data],
+                'subtitulos': [item.get('subtitulo', '') for item in data],
+                'datas': [item.get('data', '') for item in data]
+            }
+        else:
+            documents = [data.get('texto', '')]
+            content = {
+                'titulos': [data.get('titulo', '')],
+                'subtitulos': [data.get('subtitulo', '')],
+                'datas': [data.get('data', '')]
+            }
+
+        return documents, content
+
+    def _read_pdf(self, file_path: str) -> Tuple[List[str], Dict]:
+        """Lê arquivo PDF como um único documento."""
+        reader = PdfReader(file_path)
+
+        # Combina todo o texto do PDF em um único documento
+        full_text = " ".join(page.extract_text()
or "" for page in reader.pages).strip() + + # Cria uma lista com um único documento + documents = [full_text] + + # Metadata com informações do PDF + content = { + 'total_pages': len(reader.pages), + 'file_name': Path(file_path).name, + 'file_type': 'pdf', + 'file_size': Path(file_path).stat().st_size, # tamanho em bytes + 'created_at': str(Path(file_path).stat().st_ctime), # data de criação + 'modified_at': str(Path(file_path).stat().st_mtime) # data de modificação + } + + return documents, content \ No newline at end of file From 3634b668ad5be93777342643d147870559913f9d Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Fri, 14 Feb 2025 17:07:28 -0300 Subject: [PATCH 03/15] [feat] uploads in file reader --- src/services/docmuent_extration/extractor.py | 99 +++++++++++--------- 1 file changed, 55 insertions(+), 44 deletions(-) diff --git a/src/services/docmuent_extration/extractor.py b/src/services/docmuent_extration/extractor.py index 3c81712..0cbed06 100644 --- a/src/services/docmuent_extration/extractor.py +++ b/src/services/docmuent_extration/extractor.py @@ -7,24 +7,42 @@ class Reader: """ - Classe genérica para leitura de diferentes tipos de arquivos (JSON, PDF, DOCX, TXT). + Classe para leitura e processamento de diferentes tipos de arquivos (JSON, PDF, DOCX, TXT). + + Esta classe fornece uma interface unificada para ler diferentes formatos de arquivo, + processando-os em documentos individuais e gerando identificadores únicos e metadados + para cada documento. + + Attributes: + supported_extensions (set): Extensões de arquivo suportadas (.json, .pdf, .docx, .txt) + documents (list): Lista dos documentos processados + ids (list): Lista de identificadores únicos para cada documento + metadata (list): Lista de metadados associados a cada documento + + Example: + >>> reader = Reader("documento.pdf", "noticia") + >>> print(len(reader.documents)) # número de documentos + >>> print(reader.ids) # lista de IDs gerados + >>> print(reader.metadata) # metadados dos documentos + + Note: + - Arquivos PDF e DOCX são tratados como um único documento + - Arquivos JSON podem conter múltiplos documentos + - Arquivos TXT são tratados como um único documento """ - def __init__(self): - """Inicializa o leitor de documentos.""" + def __init__(self, file_path: str, document_content: str): + self.supported_extensions = {'.json', '.pdf', '.docx', '.txt'} + # Inicializa os atributos + self.documents = [] + self.ids = [] + self.metadata = [] + # Processa o arquivo na inicialização + self.documents, self.ids, self.metadata = self.__call__(file_path, document_content) def __call__(self, file_path: str, document_content: str) -> Tuple[List[str], List[str], List[Dict]]: - """ - Lê um arquivo e retorna seus documentos, IDs e metadados. 
- - Args: - file_path (str): Caminho para o arquivo - document_content (str): Tipo do documento (ex: 'noticia', 'artigo', 'contrato') - - Returns: - Tuple[List[str], List[str], List[Dict]]: (documentos, ids, metadados) - """ + path = Path(file_path) if not path.exists(): @@ -33,7 +51,7 @@ def __call__(self, file_path: str, document_content: str) -> Tuple[List[str], Li if path.suffix.lower() not in self.supported_extensions: raise ValueError(f"Extensão não suportada: {path.suffix}") - # Seleciona o método apropriado baseado na extensão + # Resto do código permanece igual readers = { '.json': self._read_json, '.pdf': self._read_pdf, @@ -42,41 +60,26 @@ def __call__(self, file_path: str, document_content: str) -> Tuple[List[str], Li } reader = readers.get(path.suffix.lower()) - documents, content = reader(file_path) - - # Adiciona os documentos ao content para uso no _generate_metadata - content['documents'] = documents + documents = reader(file_path) - # Gera IDs e metadados ids = [f"{document_content}_{i+1}" for i in range(len(documents))] - metadata = self._generate_metadata(content, document_content, path.name) + metadata = [{'document_content': document_content} for _ in range(len(documents))] return documents, ids, metadata def _read_json(self, file_path: str) -> Tuple[List[str], Dict]: - """Lê arquivo JSON.""" + with open(file_path, 'r', encoding='utf-8') as file: data = json.load(file) if isinstance(data, list): documents = [item.get('texto', '') for item in data] - content = { - 'titulos': [item.get('titulo', '') for item in data], - 'subtitulos': [item.get('subtitulo', '') for item in data], - 'datas': [item.get('data', '') for item in data] - } else: documents = [data.get('texto', '')] - content = { - 'titulos': [data.get('titulo', '')], - 'subtitulos': [data.get('subtitulo', '')], - 'datas': [data.get('data', '')] - } - - return documents, content + return documents def _read_pdf(self, file_path: str) -> Tuple[List[str], Dict]: - """Lê arquivo PDF como um único documento.""" + reader = PdfReader(file_path) # Combina todo o texto do PDF em um único documento @@ -85,14 +88,22 @@ def _read_pdf(self, file_path: str) -> Tuple[List[str], Dict]: # Cria uma lista com um único documento documents = [full_text] - # Metadata com informações do PDF - content = { - 'total_pages': len(reader.pages), - 'file_name': Path(file_path).name, - 'file_type': 'pdf', - 'file_size': Path(file_path).stat().st_size, # tamanho em bytes - 'created_at': str(Path(file_path).stat().st_ctime), # data de criação - 'modified_at': str(Path(file_path).stat().st_mtime) # data de modificação - } + return documents + + def _read_docx(self, file_path: str) -> Tuple[List[str], Dict]: + + doc = Document(file_path) + # Combina todos os parágrafos em um único texto, separando por espaços + full_text = " ".join(paragraph.text.strip() for paragraph in doc.paragraphs if paragraph.text.strip()) + # Retorna uma lista com um único documento + documents = [full_text] + + return documents + + def _read_txt(self, file_path: str) -> Tuple[List[str], Dict]: + + with open(file_path, 'r', encoding='utf-8') as file: + text = file.read() + documents = [text] - return documents, content \ No newline at end of file + return documents From 850ba02572ac910ed3e65fcdb80731846a9bcb1d Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Fri, 14 Feb 2025 19:28:15 -0300 Subject: [PATCH 04/15] [feat] update file reader --- src/services/docmuent_extration/extractor.py | 122 ++++++++++++++++--- 1 file changed, 103 insertions(+), 19 deletions(-) diff 
--git a/src/services/docmuent_extration/extractor.py b/src/services/docmuent_extration/extractor.py index 0cbed06..7e99bdb 100644 --- a/src/services/docmuent_extration/extractor.py +++ b/src/services/docmuent_extration/extractor.py @@ -4,8 +4,23 @@ from docx import Document from PyPDF2 import PdfReader import re +from dataclasses import dataclass -class Reader: +@dataclass +class DocumentResult: + """ + Classe para armazenar o resultado da leitura de documentos. + + Attributes: + documents (List[str]): Lista de documentos processados + ids (List[str]): Lista de identificadores únicos + metadata (List[Dict]): Lista de metadados associados + """ + documents: List[str] + ids: List[str] + metadata: List[Dict] + +class DocumentFileReader: """ Classe para leitura e processamento de diferentes tipos de arquivos (JSON, PDF, DOCX, TXT). @@ -32,17 +47,38 @@ class Reader: """ def __init__(self, file_path: str, document_content: str): - + """ + Inicializa o leitor de documentos. + + Args: + file_path (str): Caminho do arquivo a ser lido + document_content (str): Tipo de conteúdo do documento (ex: "noticia", "artigo") + + Raises: + FileNotFoundError: Se o arquivo não for encontrado + ValueError: Se a extensão do arquivo não for suportada + """ self.supported_extensions = {'.json', '.pdf', '.docx', '.txt'} - # Inicializa os atributos - self.documents = [] - self.ids = [] - self.metadata = [] - # Processa o arquivo na inicialização - self.documents, self.ids, self.metadata = self.__call__(file_path, document_content) + result = self.__call__(file_path, document_content) + self.documents = result.documents + self.ids = result.ids + self.metadata = result.metadata - def __call__(self, file_path: str, document_content: str) -> Tuple[List[str], List[str], List[Dict]]: + def __call__(self, file_path: str, document_content: str) -> DocumentResult: + """ + Processa o arquivo e retorna os resultados estruturados. + Args: + file_path (str): Caminho do arquivo a ser lido + document_content (str): Tipo de conteúdo do documento + + Returns: + DocumentResult: Objeto contendo documentos, IDs e metadados + + Raises: + FileNotFoundError: Se o arquivo não for encontrado + ValueError: Se a extensão do arquivo não for suportada + """ path = Path(file_path) if not path.exists(): @@ -62,13 +98,25 @@ def __call__(self, file_path: str, document_content: str) -> Tuple[List[str], Li reader = readers.get(path.suffix.lower()) documents = reader(file_path) - ids = [f"{document_content}_{i+1}" for i in range(len(documents))] + ids = f"{document_content}" metadata = [{'document_content': document_content} for _ in range(len(documents))] - return documents, ids, metadata + return DocumentResult(documents=documents, ids=ids, metadata=metadata) - def _read_json(self, file_path: str) -> Tuple[List[str], Dict]: - + def _read_json(self, file_path: str) -> List[str]: + """ + Lê e processa arquivos JSON. + + Args: + file_path (str): Caminho do arquivo JSON + + Returns: + List[str]: Lista de documentos extraídos do JSON + + Notes: + - Espera-se que o JSON contenha uma chave 'texto' em cada item + - Pode processar tanto JSON único quanto lista de JSONs + """ with open(file_path, 'r', encoding='utf-8') as file: data = json.load(file) @@ -78,8 +126,20 @@ def _read_json(self, file_path: str) -> Tuple[List[str], Dict]: documents = [data.get('texto', '')] return documents - def _read_pdf(self, file_path: str) -> Tuple[List[str], Dict]: - + def _read_pdf(self, file_path: str) -> List[str]: + """ + Lê e processa arquivos PDF. 
+ + Args: + file_path (str): Caminho do arquivo PDF + + Returns: + List[str]: Lista contendo o texto completo do PDF + + Notes: + - Todo o conteúdo do PDF é combinado em um único documento + - Páginas vazias são ignoradas + """ reader = PdfReader(file_path) # Combina todo o texto do PDF em um único documento @@ -90,8 +150,20 @@ def _read_pdf(self, file_path: str) -> Tuple[List[str], Dict]: return documents - def _read_docx(self, file_path: str) -> Tuple[List[str], Dict]: - + def _read_docx(self, file_path: str) -> List[str]: + """ + Lê e processa arquivos DOCX. + + Args: + file_path (str): Caminho do arquivo DOCX + + Returns: + List[str]: Lista contendo o texto completo do documento + + Notes: + - Parágrafos vazios são ignorados + - Todo o conteúdo é combinado em um único documento + """ doc = Document(file_path) # Combina todos os parágrafos em um único texto, separando por espaços full_text = " ".join(paragraph.text.strip() for paragraph in doc.paragraphs if paragraph.text.strip()) @@ -100,10 +172,22 @@ def _read_docx(self, file_path: str) -> Tuple[List[str], Dict]: return documents - def _read_txt(self, file_path: str) -> Tuple[List[str], Dict]: - + def _read_txt(self, file_path: str) -> List[str]: + """ + Lê e processa arquivos de texto. + + Args: + file_path (str): Caminho do arquivo TXT + + Returns: + List[str]: Lista contendo o texto completo do arquivo + + Notes: + - O arquivo inteiro é tratado como um único documento + """ with open(file_path, 'r', encoding='utf-8') as file: text = file.read() documents = [text] return documents + \ No newline at end of file From 29083395758510dc0c292a10130d4c3fd82f1458 Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Wed, 19 Feb 2025 17:57:06 -0300 Subject: [PATCH 05/15] [feat] change port from 8000 to 8001 --- run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.py b/run.py index 9e0807d..04d65fb 100644 --- a/run.py +++ b/run.py @@ -4,4 +4,4 @@ app = create_app() if __name__ == "__main__": - uvicorn.run(app, host="0.0.0.0", port=8000) + uvicorn.run(app, host="0.0.0.0", port=8001) From c1b2b1d18b80f53517ad0310d2a17c68bcb1b771 Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Wed, 19 Feb 2025 17:58:44 -0300 Subject: [PATCH 06/15] [feat] add document_router --- src/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main.py b/src/main.py index 1cd2aaa..4e2d5b7 100644 --- a/src/main.py +++ b/src/main.py @@ -1,10 +1,10 @@ from fastapi import FastAPI -from src.infrastructure.database import MongoDB +from src.infrastructure.database.mongodb.connector import MongoDB from src.infrastructure.config.llm import LLM from src.services.llama_guard import LlamaGuard -from src.api.routes import chat_router +from src.api.routes import chat_router, document_router def create_app(): @@ -17,5 +17,6 @@ def create_app(): # including routes app.include_router(chat_router) + app.include_router(document_router) return app From ab33355fe5f118bd934b956b17e7b1c014180a97 Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Wed, 19 Feb 2025 18:03:21 -0300 Subject: [PATCH 07/15] [feat] add document management routes --- src/api/routes/route_document.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 src/api/routes/route_document.py diff --git a/src/api/routes/route_document.py b/src/api/routes/route_document.py new file mode 100644 index 0000000..8c8d9dc --- /dev/null +++ b/src/api/routes/route_document.py @@ -0,0 +1,29 @@ +from fastapi import APIRouter, status, Request, 
Depends, HTTPException +from src.api.models import APIResponse +from src.api.models.documents_request import DocumentRequest +from typing import Optional, Dict +from src.api.controllers.controller_document import add_documents +router = APIRouter( + prefix="/documents", + tags=["documents"] +) + +@router.post("/add_documents", status_code=status.HTTP_200_OK) +async def add_document_route(doc_request: DocumentRequest) -> APIResponse: + try: + result = await add_documents( + doc_request.file_path, + doc_request.document_content, + doc_request.collection_name + ) + + return APIResponse( + status_code=200, + response=result + ) + + except Exception as e: + return APIResponse( + status_code=500, + status_message=f"Error detalhado: {str(e)}" + ) \ No newline at end of file From 0a0bd5453f844106bafc68ea14b6f95e1455707f Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Wed, 19 Feb 2025 18:04:01 -0300 Subject: [PATCH 08/15] [feat] add document management requests --- src/api/models/documents_request.py | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 src/api/models/documents_request.py diff --git a/src/api/models/documents_request.py b/src/api/models/documents_request.py new file mode 100644 index 0000000..fb5e6c1 --- /dev/null +++ b/src/api/models/documents_request.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + +class DocumentRequest(BaseModel): + file_path: str + document_content: str + collection_name: str \ No newline at end of file From 1f6e4e7890d597ee997256e75aa1ed66df91ea6d Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Wed, 19 Feb 2025 18:05:04 -0300 Subject: [PATCH 09/15] [feat] add document management controller --- src/api/controllers/controller_document.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 src/api/controllers/controller_document.py diff --git a/src/api/controllers/controller_document.py b/src/api/controllers/controller_document.py new file mode 100644 index 0000000..8bd022f --- /dev/null +++ b/src/api/controllers/controller_document.py @@ -0,0 +1,20 @@ +from src.infrastructure.database.chromadb.conector import ChromaDB +from src.services.docmuent_extration.extractor import DocumentFileReader +from typing import List, Dict + + +async def add_documents( + file_path: str, + document_content: str, + collection_name: str +) -> Dict: + db = ChromaDB() + + reader = DocumentFileReader(file_path, document_content) + result = await db.add_documents( + documents=reader.documents, + collection_name=collection_name, + metadatas=reader.metadata, + ids=reader.ids + ) + return result From a23ed1e9b1e72704752f6bd772e80749e867bb3f Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Wed, 19 Feb 2025 18:07:12 -0300 Subject: [PATCH 10/15] [feat] add document management in inits --- src/api/controllers/__init__.py | 3 ++- src/api/models/__init__.py | 3 ++- src/api/routes/__init__.py | 6 ++++++ 3 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 src/api/routes/__init__.py diff --git a/src/api/controllers/__init__.py b/src/api/controllers/__init__.py index 3393a26..2f18c19 100644 --- a/src/api/controllers/__init__.py +++ b/src/api/controllers/__init__.py @@ -1,4 +1,5 @@ from .chat import new_message as chat_new_message from .guardrails import Guardrail +from .controller_document import add_documents as add_documents -__all__ = ["chat_new_message", "Guardrail"] +__all__ = ["chat_new_message", "add_documents", "Guardrail"] diff --git a/src/api/models/__init__.py b/src/api/models/__init__.py index f19d09f..aa52eab 100644 
--- a/src/api/models/__init__.py +++ b/src/api/models/__init__.py @@ -1,3 +1,4 @@ from .api import APIResponse, APIRequest +from .documents_request import DocumentRequest -__all__ = ["APIResponse", "APIRequest"] +__all__ = ["APIResponse", "APIRequest", "DocumentRequest"] diff --git a/src/api/routes/__init__.py b/src/api/routes/__init__.py new file mode 100644 index 0000000..b1efb50 --- /dev/null +++ b/src/api/routes/__init__.py @@ -0,0 +1,6 @@ +from src.api.routes.route_document import router as document_router +from src.api.routes.chat import router as chat_router + + +__all__ = ["chat_router", "document_router"] + From 071181a39bedb60161a221ad33681345b0a0bbd7 Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Thu, 20 Feb 2025 21:30:49 -0300 Subject: [PATCH 11/15] [feat] add delete and querry route --- src/api/controllers/controller_document.py | 29 +++++++++++++- src/api/models/__init__.py | 4 +- src/api/models/documents_request.py | 13 +++++- src/api/routes/route_document.py | 46 ++++++++++++++++++++-- 4 files changed, 84 insertions(+), 8 deletions(-) diff --git a/src/api/controllers/controller_document.py b/src/api/controllers/controller_document.py index 8bd022f..3204647 100644 --- a/src/api/controllers/controller_document.py +++ b/src/api/controllers/controller_document.py @@ -1,14 +1,14 @@ from src.infrastructure.database.chromadb.conector import ChromaDB from src.services.docmuent_extration.extractor import DocumentFileReader -from typing import List, Dict +from typing import List, Dict, Optional async def add_documents( + db: ChromaDB, file_path: str, document_content: str, collection_name: str ) -> Dict: - db = ChromaDB() reader = DocumentFileReader(file_path, document_content) result = await db.add_documents( @@ -18,3 +18,28 @@ async def add_documents( ids=reader.ids ) return result + +async def query_documents( + db: ChromaDB, + query_text: str, + collection_name: str, + n_results: int = 5, + where: Optional[dict] = None +): + result = await db.query_documents( + query_text=query_text, + collection_name=collection_name, + n_results=n_results, + where=where) + return result + +async def delete_documents( + db: ChromaDB, + ids: List[str], + collection_name: str +): + result = await db.delete_documents( + ids=ids, + collection_name=collection_name + ) + return result \ No newline at end of file diff --git a/src/api/models/__init__.py b/src/api/models/__init__.py index aa52eab..c8a1b37 100644 --- a/src/api/models/__init__.py +++ b/src/api/models/__init__.py @@ -1,4 +1,4 @@ from .api import APIResponse, APIRequest -from .documents_request import DocumentRequest +from .documents_request import AddDocumentRequest, QueryDocumentRequest, DeleteDocumentRequest -__all__ = ["APIResponse", "APIRequest", "DocumentRequest"] +__all__ = ["APIResponse", "APIRequest", "AddDocumentRequest", "QueryDocumentRequest", "DeleteDocumentRequest"] diff --git a/src/api/models/documents_request.py b/src/api/models/documents_request.py index fb5e6c1..010c6b4 100644 --- a/src/api/models/documents_request.py +++ b/src/api/models/documents_request.py @@ -1,6 +1,17 @@ from pydantic import BaseModel +from typing import Optional, List, Dict -class DocumentRequest(BaseModel): +class AddDocumentRequest(BaseModel): file_path: str document_content: str + collection_name: str + +class QueryDocumentRequest(BaseModel): + query_text: str + collection_name: str + n_results: Optional[int] = 5 + where: Optional[Dict] = None + +class DeleteDocumentRequest(BaseModel): + ids: List[str] collection_name: str \ No newline at end 
of file diff --git a/src/api/routes/route_document.py b/src/api/routes/route_document.py index 8c8d9dc..af5b205 100644 --- a/src/api/routes/route_document.py +++ b/src/api/routes/route_document.py @@ -1,17 +1,19 @@ from fastapi import APIRouter, status, Request, Depends, HTTPException from src.api.models import APIResponse -from src.api.models.documents_request import DocumentRequest +from src.api.models.documents_request import AddDocumentRequest, QueryDocumentRequest, DeleteDocumentRequest from typing import Optional, Dict -from src.api.controllers.controller_document import add_documents +from src.api.controllers.controller_document import add_documents, query_documents, delete_documents + router = APIRouter( prefix="/documents", tags=["documents"] ) @router.post("/add_documents", status_code=status.HTTP_200_OK) -async def add_document_route(doc_request: DocumentRequest) -> APIResponse: +async def add_document_route(doc_request: AddDocumentRequest, req: Request) -> APIResponse: try: result = await add_documents( + req.app.vectordb, doc_request.file_path, doc_request.document_content, doc_request.collection_name @@ -22,6 +24,44 @@ async def add_document_route(doc_request: DocumentRequest) -> APIResponse: response=result ) + except Exception as e: + return APIResponse( + status_code=500, + status_message=f"Error detalhado: {str(e)}" + ) + +@router.post("/query_documents", status_code=status.HTTP_200_OK) +async def query_document_route(doc_request: QueryDocumentRequest, req: Request) -> APIResponse: + try: + result = await query_documents( + req.app.vectordb, + doc_request.query_text, + doc_request.collection_name, + doc_request.n_results, + doc_request.where + ) + return APIResponse( + status_code=200, + response=result + ) + except Exception as e: + return APIResponse( + status_code=500, + status_message=f"Error detalhado: {str(e)}" + ) + +@router.delete("/delete_documents", status_code=status.HTTP_200_OK) +async def delete_document_route(doc_request: DeleteDocumentRequest, req: Request) -> APIResponse: + try: + result = await delete_documents( + req.app.vectordb, + doc_request.ids, + doc_request.collection_name + ) + return APIResponse( + status_code=200, + response=result + ) except Exception as e: return APIResponse( status_code=500, From a22d83cf2352b50c13d5e82a503d379c63d189aa Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Sun, 23 Feb 2025 13:58:36 -0300 Subject: [PATCH 12/15] [feat] add list documents and collections routes --- src/api/controllers/controller_document.py | 22 +++++++++-- src/api/models/documents_request.py | 10 ++++- src/api/routes/route_document.py | 37 +++++++++++++++++-- .../database/chromadb/conector.py | 17 ++++++++- src/main.py | 3 +- 5 files changed, 79 insertions(+), 10 deletions(-) diff --git a/src/api/controllers/controller_document.py b/src/api/controllers/controller_document.py index 3204647..63ccccd 100644 --- a/src/api/controllers/controller_document.py +++ b/src/api/controllers/controller_document.py @@ -6,11 +6,12 @@ async def add_documents( db: ChromaDB, file_path: str, - document_content: str, - collection_name: str + tags_documnets: str, + collection_name: str, + documnet_id: Optional[List[str]] = None ) -> Dict: - reader = DocumentFileReader(file_path, document_content) + reader = DocumentFileReader(file_path, tags_documnets, documnet_id) result = await db.add_documents( documents=reader.documents, collection_name=collection_name, @@ -42,4 +43,19 @@ async def delete_documents( ids=ids, collection_name=collection_name ) + return result + 
+async def list_collections( + db: ChromaDB +): + result = await db.list_collections() + return result + +async def list_documents( + db: ChromaDB, + collection_name: str, +): + result = await db.list_documents( + collection_name=collection_name + ) return result \ No newline at end of file diff --git a/src/api/models/documents_request.py b/src/api/models/documents_request.py index 010c6b4..47f815b 100644 --- a/src/api/models/documents_request.py +++ b/src/api/models/documents_request.py @@ -3,7 +3,7 @@ class AddDocumentRequest(BaseModel): file_path: str - document_content: str + tags_documnets: str collection_name: str class QueryDocumentRequest(BaseModel): @@ -14,4 +14,10 @@ class QueryDocumentRequest(BaseModel): class DeleteDocumentRequest(BaseModel): ids: List[str] - collection_name: str \ No newline at end of file + collection_name: str + +class ListDocumentsRequest(BaseModel): + collection_name: str + +class ListCollectionsRequest(BaseModel): + pass \ No newline at end of file diff --git a/src/api/routes/route_document.py b/src/api/routes/route_document.py index af5b205..74141ee 100644 --- a/src/api/routes/route_document.py +++ b/src/api/routes/route_document.py @@ -1,8 +1,8 @@ from fastapi import APIRouter, status, Request, Depends, HTTPException from src.api.models import APIResponse -from src.api.models.documents_request import AddDocumentRequest, QueryDocumentRequest, DeleteDocumentRequest +from src.api.models.documents_request import AddDocumentRequest, QueryDocumentRequest, ListDocumentsRequest, ListCollectionsRequest, DeleteDocumentRequest from typing import Optional, Dict -from src.api.controllers.controller_document import add_documents, query_documents, delete_documents +from src.api.controllers.controller_document import add_documents, query_documents, delete_documents, list_collections, list_documents router = APIRouter( prefix="/documents", @@ -15,7 +15,7 @@ async def add_document_route(doc_request: AddDocumentRequest, req: Request) -> A result = await add_documents( req.app.vectordb, doc_request.file_path, - doc_request.document_content, + doc_request.tags_documnets, doc_request.collection_name ) @@ -50,6 +50,37 @@ async def query_document_route(doc_request: QueryDocumentRequest, req: Request) status_message=f"Error detalhado: {str(e)}" ) +@router.get("/list_collections", status_code=status.HTTP_200_OK) +async def list_collections_route(req: Request): + try: + result = await list_collections(req.app.vectordb) + return { + "status_code": 200, + "response": result # Retorna diretamente a lista de nomes das coleções + } + except Exception as e: + return { + "status_code": 500, + "status_message": f"Erro detalhado: {str(e)}" + } + +@router.post("/list_documents", status_code=status.HTTP_200_OK) +async def list_documents_route(doc_request: ListDocumentsRequest, req: Request) -> APIResponse: + try: + result = await list_documents( + req.app.vectordb, + doc_request.collection_name + ) + return APIResponse( + status_code=200, + response=result + ) + except Exception as e: + return APIResponse( + status_code=500, + status_message=f"Error detalhado: {str(e)}" + ) + @router.delete("/delete_documents", status_code=status.HTTP_200_OK) async def delete_document_route(doc_request: DeleteDocumentRequest, req: Request) -> APIResponse: try: diff --git a/src/infrastructure/database/chromadb/conector.py b/src/infrastructure/database/chromadb/conector.py index 155ff43..1c808b9 100644 --- a/src/infrastructure/database/chromadb/conector.py +++ 
b/src/infrastructure/database/chromadb/conector.py @@ -87,6 +87,21 @@ async def query_documents( n_results=n_results, where=where ) + + async def list_collections(self): + """List all collections in ChromaDB. + + Returns: + dict: List of collections + """ + return self.client.list_collections() + + async def list_documents(self, collection_name: str): + """List all documents in a + collection. Args: collection_name (str): Name of the collection to list documents from Returns: dict: List of documents in the collection""" + if not self.collection: + self.collection = self._create_collection(collection_name) + return self.collection.get(include=["documents", "metadatas"]) async def delete_documents(self, ids: List[str],collection_name: str): """Delete documents from collection by IDs. @@ -104,7 +119,7 @@ async def delete_documents(self, ids: List[str],collection_name: str): self.collection = self._create_collection(collection_name) return self.collection.delete(ids=ids) - async def close(self): + def __del__(self): """Close ChromaDB connection and clean up resources.""" if self.client: self.client.reset() \ No newline at end of file diff --git a/src/main.py b/src/main.py index 4e2d5b7..764888d 100644 --- a/src/main.py +++ b/src/main.py @@ -1,5 +1,5 @@ from fastapi import FastAPI - +from src.infrastructure.database.chromadb.conector import ChromaDB from src.infrastructure.database.mongodb.connector import MongoDB from src.infrastructure.config.llm import LLM from src.services.llama_guard import LlamaGuard @@ -11,6 +11,7 @@ def create_app(): app = FastAPI() # defining API variables + app.vectordb = ChromaDB() app.database = MongoDB() app.llm = LLM() app.llama_guard = LlamaGuard() From 4887e850594745da7189fd1b8158cd5bc85f4244 Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Sun, 23 Feb 2025 14:01:06 -0300 Subject: [PATCH 13/15] [feat] update document ID generation method --- src/services/docmuent_extration/extractor.py | 28 +++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/services/docmuent_extration/extractor.py b/src/services/docmuent_extration/extractor.py index 7e99bdb..605f16a 100644 --- a/src/services/docmuent_extration/extractor.py +++ b/src/services/docmuent_extration/extractor.py @@ -46,31 +46,31 @@ class DocumentFileReader: - Arquivos TXT são tratados como um único documento """ - def __init__(self, file_path: str, document_content: str): + def __init__(self, file_path: str, tags_documnets: str, documnet_id: Optional[str] = None): """ Inicializa o leitor de documentos. Args: file_path (str): Caminho do arquivo a ser lido - document_content (str): Tipo de conteúdo do documento (ex: "noticia", "artigo") + tags_documnets (str): Tipo de conteúdo do documento (ex: "noticia", "artigo") Raises: FileNotFoundError: Se o arquivo não for encontrado ValueError: Se a extensão do arquivo não for suportada """ self.supported_extensions = {'.json', '.pdf', '.docx', '.txt'} - result = self.__call__(file_path, document_content) + result = self.__call__(file_path, tags_documnets, documnet_id) self.documents = result.documents self.ids = result.ids self.metadata = result.metadata - def __call__(self, file_path: str, document_content: str) -> DocumentResult: + def __call__(self, file_path: str, tags_documnets: str, documnet_id: Optional[str] = None) -> DocumentResult: """ Processa o arquivo e retorna os resultados estruturados. 
Args: file_path (str): Caminho do arquivo a ser lido - document_content (str): Tipo de conteúdo do documento + tags_documnets (str): Tipo de conteúdo do documento Returns: DocumentResult: Objeto contendo documentos, IDs e metadados @@ -87,7 +87,6 @@ def __call__(self, file_path: str, document_content: str) -> DocumentResult: if path.suffix.lower() not in self.supported_extensions: raise ValueError(f"Extensão não suportada: {path.suffix}") - # Resto do código permanece igual readers = { '.json': self._read_json, '.pdf': self._read_pdf, @@ -98,11 +97,20 @@ def __call__(self, file_path: str, document_content: str) -> DocumentResult: reader = readers.get(path.suffix.lower()) documents = reader(file_path) - ids = f"{document_content}" - metadata = [{'document_content': document_content} for _ in range(len(documents))] + # Gerando IDs únicos + if documnet_id is None: + ids = [f"{tags_documnets}_{i}" for i in range(len(documents))] + else: + ids = [documnet_id] + metadata = [{'id': doc_id, 'tags_documnets': tags_documnets} for doc_id in ids] - return DocumentResult(documents=documents, ids=ids, metadata=metadata) - + # Retornando o DocumentResult + return DocumentResult( + documents=documents, + ids=ids, + metadata=metadata + ) + def _read_json(self, file_path: str) -> List[str]: """ Lê e processa arquivos JSON. From 0628c8dec4c09f50594def2a4b5f7e7689036035 Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Mon, 24 Feb 2025 16:41:50 -0300 Subject: [PATCH 14/15] [bugfix] Fix list function behavior --- src/api/controllers/controller_document.py | 4 ++-- src/api/models/documents_request.py | 1 - src/api/routes/route_document.py | 1 - .../database/chromadb/conector.py | 18 +++++++++--------- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/api/controllers/controller_document.py b/src/api/controllers/controller_document.py index 63ccccd..baa7742 100644 --- a/src/api/controllers/controller_document.py +++ b/src/api/controllers/controller_document.py @@ -6,12 +6,12 @@ async def add_documents( db: ChromaDB, file_path: str, - tags_documnets: str, collection_name: str, + metadata: Optional[str] = None, documnet_id: Optional[List[str]] = None ) -> Dict: - reader = DocumentFileReader(file_path, tags_documnets, documnet_id) + reader = DocumentFileReader(file_path, metadata, documnet_id) result = await db.add_documents( documents=reader.documents, collection_name=collection_name, diff --git a/src/api/models/documents_request.py b/src/api/models/documents_request.py index 47f815b..c3af766 100644 --- a/src/api/models/documents_request.py +++ b/src/api/models/documents_request.py @@ -3,7 +3,6 @@ class AddDocumentRequest(BaseModel): file_path: str - tags_documnets: str collection_name: str class QueryDocumentRequest(BaseModel): diff --git a/src/api/routes/route_document.py b/src/api/routes/route_document.py index 74141ee..32321dd 100644 --- a/src/api/routes/route_document.py +++ b/src/api/routes/route_document.py @@ -15,7 +15,6 @@ async def add_document_route(doc_request: AddDocumentRequest, req: Request) -> A result = await add_documents( req.app.vectordb, doc_request.file_path, - doc_request.tags_documnets, doc_request.collection_name ) diff --git a/src/infrastructure/database/chromadb/conector.py b/src/infrastructure/database/chromadb/conector.py index 1c808b9..adc20a0 100644 --- a/src/infrastructure/database/chromadb/conector.py +++ b/src/infrastructure/database/chromadb/conector.py @@ -18,7 +18,7 @@ def _connect(self): client = chromadb.HttpClient(host=self.host, port=self.port) return 
client - def _create_collection(self, collection_name: str): + def _get_or_create_collection(self, collection_name: str): """Create or get an existing collection. Args: @@ -52,8 +52,8 @@ async def add_documents( Returns: dict: Result of add operation """ - if not self.collection: - self.collection = self._create_collection(collection_name) + + self.collection = self._get_or_create_collection(collection_name) return self.collection.add( documents=documents, metadatas=metadatas, @@ -80,8 +80,8 @@ async def query_documents( Returns: dict: Query results """ - if not self.collection: - self.collection = self._create_collection(collection_name) + + self.collection = self._get_or_create_collection(collection_name) return self.collection.query( query_texts=[query_text], n_results=n_results, @@ -99,8 +99,8 @@ async def list_collections(self): async def list_documents(self, collection_name: str): """List all documents in a collection. Args: collection_name (str): Name of the collection to list documents from Returns: dict: List of documents in the collection""" - if not self.collection: - self.collection = self._create_collection(collection_name) + + self.collection = self._get_or_create_collection(collection_name) return self.collection.get(include=["documents", "metadatas"]) async def delete_documents(self, ids: List[str],collection_name: str): @@ -115,8 +115,8 @@ async def delete_documents(self, ids: List[str],collection_name: str): Returns: dict: Result of delete operation """ - if not self.collection: - self.collection = self._create_collection(collection_name) + + self.collection = self._get_or_create_collection(collection_name) return self.collection.delete(ids=ids) def __del__(self): From 9bbe70801e5a12ab01c1f0325ba78b2ebe4592e2 Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Mon, 24 Feb 2025 16:43:54 -0300 Subject: [PATCH 15/15] [refactor] Change ID generation logic --- src/services/docmuent_extration/extractor.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/services/docmuent_extration/extractor.py b/src/services/docmuent_extration/extractor.py index 605f16a..77ba17f 100644 --- a/src/services/docmuent_extration/extractor.py +++ b/src/services/docmuent_extration/extractor.py @@ -1,6 +1,7 @@ import json from typing import List, Dict, Tuple, Optional from pathlib import Path +import uuid from docx import Document from PyPDF2 import PdfReader import re @@ -46,31 +47,31 @@ class DocumentFileReader: - Arquivos TXT são tratados como um único documento """ - def __init__(self, file_path: str, tags_documnets: str, documnet_id: Optional[str] = None): + def __init__(self, file_path: str, metadata: str, documnet_id: Optional[str] = None): """ Inicializa o leitor de documentos. 
Args: file_path (str): Caminho do arquivo a ser lido - tags_documnets (str): Tipo de conteúdo do documento (ex: "noticia", "artigo") + metadata (str): Tipo de conteúdo do documento (ex: "noticia", "artigo") Raises: FileNotFoundError: Se o arquivo não for encontrado ValueError: Se a extensão do arquivo não for suportada """ self.supported_extensions = {'.json', '.pdf', '.docx', '.txt'} - result = self.__call__(file_path, tags_documnets, documnet_id) + result = self.__call__(file_path, metadata, documnet_id) self.documents = result.documents self.ids = result.ids self.metadata = result.metadata - def __call__(self, file_path: str, tags_documnets: str, documnet_id: Optional[str] = None) -> DocumentResult: + def __call__(self, file_path: str, metadata: Optional[str] = None, documnet_id: Optional[str] = None) -> DocumentResult: """ Processa o arquivo e retorna os resultados estruturados. Args: file_path (str): Caminho do arquivo a ser lido - tags_documnets (str): Tipo de conteúdo do documento + metadata (str): Tipo de conteúdo do documento Returns: DocumentResult: Objeto contendo documentos, IDs e metadados @@ -99,10 +100,9 @@ def __call__(self, file_path: str, tags_documnets: str, documnet_id: Optional[st # Gerando IDs únicos if documnet_id is None: - ids = [f"{tags_documnets}_{i}" for i in range(len(documents))] + ids=[str(uuid.uuid4()) for _ in enumerate(documents)] else: ids = [documnet_id] - metadata = [{'id': doc_id, 'tags_documnets': tags_documnets} for doc_id in ids] # Retornando o DocumentResult return DocumentResult(
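Taken together, these patches expose document ingestion, querying, and listing through the /documents API. Below is a minimal usage sketch, assuming the service is running locally on port 8001 (as configured in run.py), a Chroma server is reachable on localhost:8000, and the third-party requests package is installed; the file path and collection name are illustrative placeholders, not values taken from the patches.

import requests

BASE_URL = "http://localhost:8001/documents"

# Ingest a local PDF into a collection (AddDocumentRequest: file_path, collection_name).
add_resp = requests.post(
    f"{BASE_URL}/add_documents",
    json={"file_path": "data/example.pdf", "collection_name": "articles"},
)
print(add_resp.json())

# Search the collection for documents similar to a query (QueryDocumentRequest).
query_resp = requests.post(
    f"{BASE_URL}/query_documents",
    json={"query_text": "vector databases", "collection_name": "articles", "n_results": 3},
)
print(query_resp.json())

# Inspect what is stored: list all collections, then the documents in one of them.
print(requests.get(f"{BASE_URL}/list_collections").json())
print(requests.post(f"{BASE_URL}/list_documents", json={"collection_name": "articles"}).json())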