From e795e6d457ed44011da99ccc187e9871a573dbaa Mon Sep 17 00:00:00 2001
From: Pedro HB Ribeiro
Date: Wed, 12 Feb 2025 18:54:53 -0300
Subject: [PATCH 01/15] [feat] add chroma connector

---
 .../database/chromadb/conector.py | 110 ++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 src/infrastructure/database/chromadb/conector.py

diff --git a/src/infrastructure/database/chromadb/conector.py b/src/infrastructure/database/chromadb/conector.py
new file mode 100644
index 0000000..155ff43
--- /dev/null
+++ b/src/infrastructure/database/chromadb/conector.py
@@ -0,0 +1,110 @@
+import chromadb
+from chromadb.config import Settings
+from typing import List, Optional
+
+
+class ChromaDB:
+    """Manager for ChromaDB connection and operations."""
+
+    def __init__(self):
+        """Initialize ChromaDB connection."""
+        self.host = 'localhost'
+        self.port = 8000
+        self.client = self._connect()
+        self.collection = None
+
+    def _connect(self):
+        """Connect to ChromaDB."""
+        client = chromadb.HttpClient(host=self.host, port=self.port)
+        return client
+
+    def _create_collection(self, collection_name: str):
+        """Create or get an existing collection.
+
+        Args:
+            collection_name (str): Name of the collection to create/get
+
+        Returns:
+            Collection: Created/retrieved collection object
+        """
+        self.collection = self.client.get_or_create_collection(
+            name=collection_name
+        )
+        return self.collection
+
+    async def add_documents(
+        self,
+        documents: List[str],
+        collection_name: str,
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None
+    ):
+        """Add documents to the collection.
+
+        Args:
+            documents (List[str]): List of document texts
+            metadatas (Optional[List[dict]]): Document metadata
+            ids (Optional[List[str]]): Unique document IDs
+
+        Notes:
+            The collection is created or fetched automatically if not yet initialized.
+
+        Returns:
+            dict: Result of add operation
+        """
+        if not self.collection:
+            self.collection = self._create_collection(collection_name)
+        return self.collection.add(
+            documents=documents,
+            metadatas=metadatas,
+            ids=ids
+        )
+
+    async def query_documents(
+        self,
+        query_text: str,
+        collection_name: str,
+        n_results: int = 5,
+        where: Optional[dict] = None
+    ):
+        """Query similar documents in the collection.
+
+        Args:
+            query_text (str): Text to search for similarity
+            n_results (int): Number of desired results
+            where (Optional[dict]): Additional filters
+
+        Notes:
+            The collection is created or fetched automatically if not yet initialized.
+
+        Returns:
+            dict: Query results
+        """
+        if not self.collection:
+            self.collection = self._create_collection(collection_name)
+        return self.collection.query(
+            query_texts=[query_text],
+            n_results=n_results,
+            where=where
+        )
+
+    async def delete_documents(self, ids: List[str],collection_name: str):
+        """Delete documents from collection by IDs.
+
+        Args:
+            ids (List[str]): List of document IDs to delete
+
+        Notes:
+            The collection is created or fetched automatically if not yet initialized.
+
+        Returns:
+            dict: Result of delete operation
+        """
+        if not self.collection:
+            self.collection = self._create_collection(collection_name)
+        return self.collection.delete(ids=ids)
+
+    async def close(self):
+        """Close ChromaDB connection and clean up resources."""
+        if self.client:
+            self.client.reset()
\ No newline at end of file

From 1de190d17cb06d4f493af60260fc540d3c2c8cd5 Mon Sep 17 00:00:00 2001
From: Pedro HB Ribeiro
Date: Thu, 13 Feb 2025 18:00:52 -0300
Subject: [PATCH 02/15] [feat] file reader

---
 src/services/docmuent_extration/extractor.py | 98 ++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 src/services/docmuent_extration/extractor.py

diff --git a/src/services/docmuent_extration/extractor.py b/src/services/docmuent_extration/extractor.py
new file mode 100644
index 0000000..3c81712
--- /dev/null
+++ b/src/services/docmuent_extration/extractor.py
@@ -0,0 +1,98 @@
+import json
+from typing import List, Dict, Tuple, Optional
+from pathlib import Path
+from docx import Document
+from PyPDF2 import PdfReader
+import re
+
+class Reader:
+    """
+    Classe genérica para leitura de diferentes tipos de arquivos (JSON, PDF, DOCX, TXT).
+    """
+
+    def __init__(self):
+        """Inicializa o leitor de documentos."""
+        self.supported_extensions = {'.json', '.pdf', '.docx', '.txt'}
+
+    def __call__(self, file_path: str, document_content: str) -> Tuple[List[str], List[str], List[Dict]]:
+        """
+        Lê um arquivo e retorna seus documentos, IDs e metadados.
+
+        Args:
+            file_path (str): Caminho para o arquivo
+            document_content (str): Tipo do documento (ex: 'noticia', 'artigo', 'contrato')
+
+        Returns:
+            Tuple[List[str], List[str], List[Dict]]: (documentos, ids, metadados)
+        """
+        path = Path(file_path)
+
+        if not path.exists():
+            raise FileNotFoundError(f"Arquivo não encontrado: {file_path}")
+
+        if path.suffix.lower() not in self.supported_extensions:
+            raise ValueError(f"Extensão não suportada: {path.suffix}")
+
+        # Seleciona o método apropriado baseado na extensão
+        readers = {
+            '.json': self._read_json,
+            '.pdf': self._read_pdf,
+            '.docx': self._read_docx,
+            '.txt': self._read_txt
+        }
+
+        reader = readers.get(path.suffix.lower())
+        documents, content = reader(file_path)
+
+        # Adiciona os documentos ao content para uso no _generate_metadata
+        content['documents'] = documents
+
+        # Gera IDs e metadados
+        ids = [f"{document_content}_{i+1}" for i in range(len(documents))]
+        metadata = self._generate_metadata(content, document_content, path.name)
+
+        return documents, ids, metadata
+
+    def _read_json(self, file_path: str) -> Tuple[List[str], Dict]:
+        """Lê arquivo JSON."""
+        with open(file_path, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+
+        if isinstance(data, list):
+            documents = [item.get('texto', '') for item in data]
+            content = {
+                'titulos': [item.get('titulo', '') for item in data],
+                'subtitulos': [item.get('subtitulo', '') for item in data],
+                'datas': [item.get('data', '') for item in data]
+            }
+        else:
+            documents = [data.get('texto', '')]
+            content = {
+                'titulos': [data.get('titulo', '')],
+                'subtitulos': [data.get('subtitulo', '')],
+                'datas': [data.get('data', '')]
+            }
+
+        return documents, content
+
+    def _read_pdf(self, file_path: str) -> Tuple[List[str], Dict]:
+        """Lê arquivo PDF como um único documento."""
+        reader = PdfReader(file_path)
+
+        # Combina todo o texto do PDF em um único documento
+        full_text = " ".join(page.extract_text()
or "" for page in reader.pages).strip() + + # Cria uma lista com um único documento + documents = [full_text] + + # Metadata com informações do PDF + content = { + 'total_pages': len(reader.pages), + 'file_name': Path(file_path).name, + 'file_type': 'pdf', + 'file_size': Path(file_path).stat().st_size, # tamanho em bytes + 'created_at': str(Path(file_path).stat().st_ctime), # data de criação + 'modified_at': str(Path(file_path).stat().st_mtime) # data de modificação + } + + return documents, content \ No newline at end of file From 3634b668ad5be93777342643d147870559913f9d Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Fri, 14 Feb 2025 17:07:28 -0300 Subject: [PATCH 03/15] [feat] uploads in file reader --- src/services/docmuent_extration/extractor.py | 99 +++++++++++--------- 1 file changed, 55 insertions(+), 44 deletions(-) diff --git a/src/services/docmuent_extration/extractor.py b/src/services/docmuent_extration/extractor.py index 3c81712..0cbed06 100644 --- a/src/services/docmuent_extration/extractor.py +++ b/src/services/docmuent_extration/extractor.py @@ -7,24 +7,42 @@ class Reader: """ - Classe genérica para leitura de diferentes tipos de arquivos (JSON, PDF, DOCX, TXT). + Classe para leitura e processamento de diferentes tipos de arquivos (JSON, PDF, DOCX, TXT). + + Esta classe fornece uma interface unificada para ler diferentes formatos de arquivo, + processando-os em documentos individuais e gerando identificadores únicos e metadados + para cada documento. + + Attributes: + supported_extensions (set): Extensões de arquivo suportadas (.json, .pdf, .docx, .txt) + documents (list): Lista dos documentos processados + ids (list): Lista de identificadores únicos para cada documento + metadata (list): Lista de metadados associados a cada documento + + Example: + >>> reader = Reader("documento.pdf", "noticia") + >>> print(len(reader.documents)) # número de documentos + >>> print(reader.ids) # lista de IDs gerados + >>> print(reader.metadata) # metadados dos documentos + + Note: + - Arquivos PDF e DOCX são tratados como um único documento + - Arquivos JSON podem conter múltiplos documentos + - Arquivos TXT são tratados como um único documento """ - def __init__(self): - """Inicializa o leitor de documentos.""" + def __init__(self, file_path: str, document_content: str): + self.supported_extensions = {'.json', '.pdf', '.docx', '.txt'} + # Inicializa os atributos + self.documents = [] + self.ids = [] + self.metadata = [] + # Processa o arquivo na inicialização + self.documents, self.ids, self.metadata = self.__call__(file_path, document_content) def __call__(self, file_path: str, document_content: str) -> Tuple[List[str], List[str], List[Dict]]: - """ - Lê um arquivo e retorna seus documentos, IDs e metadados. 
- - Args: - file_path (str): Caminho para o arquivo - document_content (str): Tipo do documento (ex: 'noticia', 'artigo', 'contrato') - - Returns: - Tuple[List[str], List[str], List[Dict]]: (documentos, ids, metadados) - """ + path = Path(file_path) if not path.exists(): @@ -33,7 +51,7 @@ def __call__(self, file_path: str, document_content: str) -> Tuple[List[str], Li if path.suffix.lower() not in self.supported_extensions: raise ValueError(f"Extensão não suportada: {path.suffix}") - # Seleciona o método apropriado baseado na extensão + # Resto do código permanece igual readers = { '.json': self._read_json, '.pdf': self._read_pdf, @@ -42,41 +60,26 @@ def __call__(self, file_path: str, document_content: str) -> Tuple[List[str], Li } reader = readers.get(path.suffix.lower()) - documents, content = reader(file_path) - - # Adiciona os documentos ao content para uso no _generate_metadata - content['documents'] = documents + documents = reader(file_path) - # Gera IDs e metadados ids = [f"{document_content}_{i+1}" for i in range(len(documents))] - metadata = self._generate_metadata(content, document_content, path.name) + metadata = [{'document_content': document_content} for _ in range(len(documents))] return documents, ids, metadata def _read_json(self, file_path: str) -> Tuple[List[str], Dict]: - """Lê arquivo JSON.""" + with open(file_path, 'r', encoding='utf-8') as file: data = json.load(file) if isinstance(data, list): documents = [item.get('texto', '') for item in data] - content = { - 'titulos': [item.get('titulo', '') for item in data], - 'subtitulos': [item.get('subtitulo', '') for item in data], - 'datas': [item.get('data', '') for item in data] - } else: documents = [data.get('texto', '')] - content = { - 'titulos': [data.get('titulo', '')], - 'subtitulos': [data.get('subtitulo', '')], - 'datas': [data.get('data', '')] - } - - return documents, content + return documents def _read_pdf(self, file_path: str) -> Tuple[List[str], Dict]: - """Lê arquivo PDF como um único documento.""" + reader = PdfReader(file_path) # Combina todo o texto do PDF em um único documento @@ -85,14 +88,22 @@ def _read_pdf(self, file_path: str) -> Tuple[List[str], Dict]: # Cria uma lista com um único documento documents = [full_text] - # Metadata com informações do PDF - content = { - 'total_pages': len(reader.pages), - 'file_name': Path(file_path).name, - 'file_type': 'pdf', - 'file_size': Path(file_path).stat().st_size, # tamanho em bytes - 'created_at': str(Path(file_path).stat().st_ctime), # data de criação - 'modified_at': str(Path(file_path).stat().st_mtime) # data de modificação - } + return documents + + def _read_docx(self, file_path: str) -> Tuple[List[str], Dict]: + + doc = Document(file_path) + # Combina todos os parágrafos em um único texto, separando por espaços + full_text = " ".join(paragraph.text.strip() for paragraph in doc.paragraphs if paragraph.text.strip()) + # Retorna uma lista com um único documento + documents = [full_text] + + return documents + + def _read_txt(self, file_path: str) -> Tuple[List[str], Dict]: + + with open(file_path, 'r', encoding='utf-8') as file: + text = file.read() + documents = [text] - return documents, content \ No newline at end of file + return documents From 850ba02572ac910ed3e65fcdb80731846a9bcb1d Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Fri, 14 Feb 2025 19:28:15 -0300 Subject: [PATCH 04/15] [feat] update file reader --- src/services/docmuent_extration/extractor.py | 122 ++++++++++++++++--- 1 file changed, 103 insertions(+), 19 deletions(-) diff 
--git a/src/services/docmuent_extration/extractor.py b/src/services/docmuent_extration/extractor.py index 0cbed06..7e99bdb 100644 --- a/src/services/docmuent_extration/extractor.py +++ b/src/services/docmuent_extration/extractor.py @@ -4,8 +4,23 @@ from docx import Document from PyPDF2 import PdfReader import re +from dataclasses import dataclass -class Reader: +@dataclass +class DocumentResult: + """ + Classe para armazenar o resultado da leitura de documentos. + + Attributes: + documents (List[str]): Lista de documentos processados + ids (List[str]): Lista de identificadores únicos + metadata (List[Dict]): Lista de metadados associados + """ + documents: List[str] + ids: List[str] + metadata: List[Dict] + +class DocumentFileReader: """ Classe para leitura e processamento de diferentes tipos de arquivos (JSON, PDF, DOCX, TXT). @@ -32,17 +47,38 @@ class Reader: """ def __init__(self, file_path: str, document_content: str): - + """ + Inicializa o leitor de documentos. + + Args: + file_path (str): Caminho do arquivo a ser lido + document_content (str): Tipo de conteúdo do documento (ex: "noticia", "artigo") + + Raises: + FileNotFoundError: Se o arquivo não for encontrado + ValueError: Se a extensão do arquivo não for suportada + """ self.supported_extensions = {'.json', '.pdf', '.docx', '.txt'} - # Inicializa os atributos - self.documents = [] - self.ids = [] - self.metadata = [] - # Processa o arquivo na inicialização - self.documents, self.ids, self.metadata = self.__call__(file_path, document_content) + result = self.__call__(file_path, document_content) + self.documents = result.documents + self.ids = result.ids + self.metadata = result.metadata - def __call__(self, file_path: str, document_content: str) -> Tuple[List[str], List[str], List[Dict]]: + def __call__(self, file_path: str, document_content: str) -> DocumentResult: + """ + Processa o arquivo e retorna os resultados estruturados. + Args: + file_path (str): Caminho do arquivo a ser lido + document_content (str): Tipo de conteúdo do documento + + Returns: + DocumentResult: Objeto contendo documentos, IDs e metadados + + Raises: + FileNotFoundError: Se o arquivo não for encontrado + ValueError: Se a extensão do arquivo não for suportada + """ path = Path(file_path) if not path.exists(): @@ -62,13 +98,25 @@ def __call__(self, file_path: str, document_content: str) -> Tuple[List[str], Li reader = readers.get(path.suffix.lower()) documents = reader(file_path) - ids = [f"{document_content}_{i+1}" for i in range(len(documents))] + ids = f"{document_content}" metadata = [{'document_content': document_content} for _ in range(len(documents))] - return documents, ids, metadata + return DocumentResult(documents=documents, ids=ids, metadata=metadata) - def _read_json(self, file_path: str) -> Tuple[List[str], Dict]: - + def _read_json(self, file_path: str) -> List[str]: + """ + Lê e processa arquivos JSON. + + Args: + file_path (str): Caminho do arquivo JSON + + Returns: + List[str]: Lista de documentos extraídos do JSON + + Notes: + - Espera-se que o JSON contenha uma chave 'texto' em cada item + - Pode processar tanto JSON único quanto lista de JSONs + """ with open(file_path, 'r', encoding='utf-8') as file: data = json.load(file) @@ -78,8 +126,20 @@ def _read_json(self, file_path: str) -> Tuple[List[str], Dict]: documents = [data.get('texto', '')] return documents - def _read_pdf(self, file_path: str) -> Tuple[List[str], Dict]: - + def _read_pdf(self, file_path: str) -> List[str]: + """ + Lê e processa arquivos PDF. 
+ + Args: + file_path (str): Caminho do arquivo PDF + + Returns: + List[str]: Lista contendo o texto completo do PDF + + Notes: + - Todo o conteúdo do PDF é combinado em um único documento + - Páginas vazias são ignoradas + """ reader = PdfReader(file_path) # Combina todo o texto do PDF em um único documento @@ -90,8 +150,20 @@ def _read_pdf(self, file_path: str) -> Tuple[List[str], Dict]: return documents - def _read_docx(self, file_path: str) -> Tuple[List[str], Dict]: - + def _read_docx(self, file_path: str) -> List[str]: + """ + Lê e processa arquivos DOCX. + + Args: + file_path (str): Caminho do arquivo DOCX + + Returns: + List[str]: Lista contendo o texto completo do documento + + Notes: + - Parágrafos vazios são ignorados + - Todo o conteúdo é combinado em um único documento + """ doc = Document(file_path) # Combina todos os parágrafos em um único texto, separando por espaços full_text = " ".join(paragraph.text.strip() for paragraph in doc.paragraphs if paragraph.text.strip()) @@ -100,10 +172,22 @@ def _read_docx(self, file_path: str) -> Tuple[List[str], Dict]: return documents - def _read_txt(self, file_path: str) -> Tuple[List[str], Dict]: - + def _read_txt(self, file_path: str) -> List[str]: + """ + Lê e processa arquivos de texto. + + Args: + file_path (str): Caminho do arquivo TXT + + Returns: + List[str]: Lista contendo o texto completo do arquivo + + Notes: + - O arquivo inteiro é tratado como um único documento + """ with open(file_path, 'r', encoding='utf-8') as file: text = file.read() documents = [text] return documents + \ No newline at end of file From 29083395758510dc0c292a10130d4c3fd82f1458 Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Wed, 19 Feb 2025 17:57:06 -0300 Subject: [PATCH 05/15] [feat] change port from 8000 to 8001 --- run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.py b/run.py index 9e0807d..04d65fb 100644 --- a/run.py +++ b/run.py @@ -4,4 +4,4 @@ app = create_app() if __name__ == "__main__": - uvicorn.run(app, host="0.0.0.0", port=8000) + uvicorn.run(app, host="0.0.0.0", port=8001) From c1b2b1d18b80f53517ad0310d2a17c68bcb1b771 Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Wed, 19 Feb 2025 17:58:44 -0300 Subject: [PATCH 06/15] [feat] add document_router --- src/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main.py b/src/main.py index 1cd2aaa..4e2d5b7 100644 --- a/src/main.py +++ b/src/main.py @@ -1,10 +1,10 @@ from fastapi import FastAPI -from src.infrastructure.database import MongoDB +from src.infrastructure.database.mongodb.connector import MongoDB from src.infrastructure.config.llm import LLM from src.services.llama_guard import LlamaGuard -from src.api.routes import chat_router +from src.api.routes import chat_router, document_router def create_app(): @@ -17,5 +17,6 @@ def create_app(): # including routes app.include_router(chat_router) + app.include_router(document_router) return app From ab33355fe5f118bd934b956b17e7b1c014180a97 Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Wed, 19 Feb 2025 18:03:21 -0300 Subject: [PATCH 07/15] [feat] add document management routes --- src/api/routes/route_document.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 src/api/routes/route_document.py diff --git a/src/api/routes/route_document.py b/src/api/routes/route_document.py new file mode 100644 index 0000000..8c8d9dc --- /dev/null +++ b/src/api/routes/route_document.py @@ -0,0 +1,29 @@ +from fastapi import APIRouter, status, Request, 
Depends, HTTPException +from src.api.models import APIResponse +from src.api.models.documents_request import DocumentRequest +from typing import Optional, Dict +from src.api.controllers.controller_document import add_documents +router = APIRouter( + prefix="/documents", + tags=["documents"] +) + +@router.post("/add_documents", status_code=status.HTTP_200_OK) +async def add_document_route(doc_request: DocumentRequest) -> APIResponse: + try: + result = await add_documents( + doc_request.file_path, + doc_request.document_content, + doc_request.collection_name + ) + + return APIResponse( + status_code=200, + response=result + ) + + except Exception as e: + return APIResponse( + status_code=500, + status_message=f"Error detalhado: {str(e)}" + ) \ No newline at end of file From 0a0bd5453f844106bafc68ea14b6f95e1455707f Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Wed, 19 Feb 2025 18:04:01 -0300 Subject: [PATCH 08/15] [feat] add document management requests --- src/api/models/documents_request.py | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 src/api/models/documents_request.py diff --git a/src/api/models/documents_request.py b/src/api/models/documents_request.py new file mode 100644 index 0000000..fb5e6c1 --- /dev/null +++ b/src/api/models/documents_request.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + +class DocumentRequest(BaseModel): + file_path: str + document_content: str + collection_name: str \ No newline at end of file From 1f6e4e7890d597ee997256e75aa1ed66df91ea6d Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Wed, 19 Feb 2025 18:05:04 -0300 Subject: [PATCH 09/15] [feat] add document management controller --- src/api/controllers/controller_document.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 src/api/controllers/controller_document.py diff --git a/src/api/controllers/controller_document.py b/src/api/controllers/controller_document.py new file mode 100644 index 0000000..8bd022f --- /dev/null +++ b/src/api/controllers/controller_document.py @@ -0,0 +1,20 @@ +from src.infrastructure.database.chromadb.conector import ChromaDB +from src.services.docmuent_extration.extractor import DocumentFileReader +from typing import List, Dict + + +async def add_documents( + file_path: str, + document_content: str, + collection_name: str +) -> Dict: + db = ChromaDB() + + reader = DocumentFileReader(file_path, document_content) + result = await db.add_documents( + documents=reader.documents, + collection_name=collection_name, + metadatas=reader.metadata, + ids=reader.ids + ) + return result From a23ed1e9b1e72704752f6bd772e80749e867bb3f Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Wed, 19 Feb 2025 18:07:12 -0300 Subject: [PATCH 10/15] [feat] add document management in inits --- src/api/controllers/__init__.py | 3 ++- src/api/models/__init__.py | 3 ++- src/api/routes/__init__.py | 6 ++++++ 3 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 src/api/routes/__init__.py diff --git a/src/api/controllers/__init__.py b/src/api/controllers/__init__.py index 3393a26..2f18c19 100644 --- a/src/api/controllers/__init__.py +++ b/src/api/controllers/__init__.py @@ -1,4 +1,5 @@ from .chat import new_message as chat_new_message from .guardrails import Guardrail +from .controller_document import add_documents as add_documents -__all__ = ["chat_new_message", "Guardrail"] +__all__ = ["chat_new_message", "add_documents", "Guardrail"] diff --git a/src/api/models/__init__.py b/src/api/models/__init__.py index f19d09f..aa52eab 100644 
--- a/src/api/models/__init__.py +++ b/src/api/models/__init__.py @@ -1,3 +1,4 @@ from .api import APIResponse, APIRequest +from .documents_request import DocumentRequest -__all__ = ["APIResponse", "APIRequest"] +__all__ = ["APIResponse", "APIRequest", "DocumentRequest"] diff --git a/src/api/routes/__init__.py b/src/api/routes/__init__.py new file mode 100644 index 0000000..b1efb50 --- /dev/null +++ b/src/api/routes/__init__.py @@ -0,0 +1,6 @@ +from src.api.routes.route_document import router as document_router +from src.api.routes.chat import router as chat_router + + +__all__ = ["chat_router", "document_router"] + From 071181a39bedb60161a221ad33681345b0a0bbd7 Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Thu, 20 Feb 2025 21:30:49 -0300 Subject: [PATCH 11/15] [feat] add delete and querry route --- src/api/controllers/controller_document.py | 29 +++++++++++++- src/api/models/__init__.py | 4 +- src/api/models/documents_request.py | 13 +++++- src/api/routes/route_document.py | 46 ++++++++++++++++++++-- 4 files changed, 84 insertions(+), 8 deletions(-) diff --git a/src/api/controllers/controller_document.py b/src/api/controllers/controller_document.py index 8bd022f..3204647 100644 --- a/src/api/controllers/controller_document.py +++ b/src/api/controllers/controller_document.py @@ -1,14 +1,14 @@ from src.infrastructure.database.chromadb.conector import ChromaDB from src.services.docmuent_extration.extractor import DocumentFileReader -from typing import List, Dict +from typing import List, Dict, Optional async def add_documents( + db: ChromaDB, file_path: str, document_content: str, collection_name: str ) -> Dict: - db = ChromaDB() reader = DocumentFileReader(file_path, document_content) result = await db.add_documents( @@ -18,3 +18,28 @@ async def add_documents( ids=reader.ids ) return result + +async def query_documents( + db: ChromaDB, + query_text: str, + collection_name: str, + n_results: int = 5, + where: Optional[dict] = None +): + result = await db.query_documents( + query_text=query_text, + collection_name=collection_name, + n_results=n_results, + where=where) + return result + +async def delete_documents( + db: ChromaDB, + ids: List[str], + collection_name: str +): + result = await db.delete_documents( + ids=ids, + collection_name=collection_name + ) + return result \ No newline at end of file diff --git a/src/api/models/__init__.py b/src/api/models/__init__.py index aa52eab..c8a1b37 100644 --- a/src/api/models/__init__.py +++ b/src/api/models/__init__.py @@ -1,4 +1,4 @@ from .api import APIResponse, APIRequest -from .documents_request import DocumentRequest +from .documents_request import AddDocumentRequest, QueryDocumentRequest, DeleteDocumentRequest -__all__ = ["APIResponse", "APIRequest", "DocumentRequest"] +__all__ = ["APIResponse", "APIRequest", "AddDocumentRequest", "QueryDocumentRequest", "DeleteDocumentRequest"] diff --git a/src/api/models/documents_request.py b/src/api/models/documents_request.py index fb5e6c1..010c6b4 100644 --- a/src/api/models/documents_request.py +++ b/src/api/models/documents_request.py @@ -1,6 +1,17 @@ from pydantic import BaseModel +from typing import Optional, List, Dict -class DocumentRequest(BaseModel): +class AddDocumentRequest(BaseModel): file_path: str document_content: str + collection_name: str + +class QueryDocumentRequest(BaseModel): + query_text: str + collection_name: str + n_results: Optional[int] = 5 + where: Optional[Dict] = None + +class DeleteDocumentRequest(BaseModel): + ids: List[str] collection_name: str \ No newline at end 
of file diff --git a/src/api/routes/route_document.py b/src/api/routes/route_document.py index 8c8d9dc..af5b205 100644 --- a/src/api/routes/route_document.py +++ b/src/api/routes/route_document.py @@ -1,17 +1,19 @@ from fastapi import APIRouter, status, Request, Depends, HTTPException from src.api.models import APIResponse -from src.api.models.documents_request import DocumentRequest +from src.api.models.documents_request import AddDocumentRequest, QueryDocumentRequest, DeleteDocumentRequest from typing import Optional, Dict -from src.api.controllers.controller_document import add_documents +from src.api.controllers.controller_document import add_documents, query_documents, delete_documents + router = APIRouter( prefix="/documents", tags=["documents"] ) @router.post("/add_documents", status_code=status.HTTP_200_OK) -async def add_document_route(doc_request: DocumentRequest) -> APIResponse: +async def add_document_route(doc_request: AddDocumentRequest, req: Request) -> APIResponse: try: result = await add_documents( + req.app.vectordb, doc_request.file_path, doc_request.document_content, doc_request.collection_name @@ -22,6 +24,44 @@ async def add_document_route(doc_request: DocumentRequest) -> APIResponse: response=result ) + except Exception as e: + return APIResponse( + status_code=500, + status_message=f"Error detalhado: {str(e)}" + ) + +@router.post("/query_documents", status_code=status.HTTP_200_OK) +async def query_document_route(doc_request: QueryDocumentRequest, req: Request) -> APIResponse: + try: + result = await query_documents( + req.app.vectordb, + doc_request.query_text, + doc_request.collection_name, + doc_request.n_results, + doc_request.where + ) + return APIResponse( + status_code=200, + response=result + ) + except Exception as e: + return APIResponse( + status_code=500, + status_message=f"Error detalhado: {str(e)}" + ) + +@router.delete("/delete_documents", status_code=status.HTTP_200_OK) +async def delete_document_route(doc_request: DeleteDocumentRequest, req: Request) -> APIResponse: + try: + result = await delete_documents( + req.app.vectordb, + doc_request.ids, + doc_request.collection_name + ) + return APIResponse( + status_code=200, + response=result + ) except Exception as e: return APIResponse( status_code=500, From a22d83cf2352b50c13d5e82a503d379c63d189aa Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Sun, 23 Feb 2025 13:58:36 -0300 Subject: [PATCH 12/15] [feat] add list documents and collections routes --- src/api/controllers/controller_document.py | 22 +++++++++-- src/api/models/documents_request.py | 10 ++++- src/api/routes/route_document.py | 37 +++++++++++++++++-- .../database/chromadb/conector.py | 17 ++++++++- src/main.py | 3 +- 5 files changed, 79 insertions(+), 10 deletions(-) diff --git a/src/api/controllers/controller_document.py b/src/api/controllers/controller_document.py index 3204647..63ccccd 100644 --- a/src/api/controllers/controller_document.py +++ b/src/api/controllers/controller_document.py @@ -6,11 +6,12 @@ async def add_documents( db: ChromaDB, file_path: str, - document_content: str, - collection_name: str + tags_documnets: str, + collection_name: str, + documnet_id: Optional[List[str]] = None ) -> Dict: - reader = DocumentFileReader(file_path, document_content) + reader = DocumentFileReader(file_path, tags_documnets, documnet_id) result = await db.add_documents( documents=reader.documents, collection_name=collection_name, @@ -42,4 +43,19 @@ async def delete_documents( ids=ids, collection_name=collection_name ) + return result + 
+async def list_collections( + db: ChromaDB +): + result = await db.list_collections() + return result + +async def list_documents( + db: ChromaDB, + collection_name: str, +): + result = await db.list_documents( + collection_name=collection_name + ) return result \ No newline at end of file diff --git a/src/api/models/documents_request.py b/src/api/models/documents_request.py index 010c6b4..47f815b 100644 --- a/src/api/models/documents_request.py +++ b/src/api/models/documents_request.py @@ -3,7 +3,7 @@ class AddDocumentRequest(BaseModel): file_path: str - document_content: str + tags_documnets: str collection_name: str class QueryDocumentRequest(BaseModel): @@ -14,4 +14,10 @@ class QueryDocumentRequest(BaseModel): class DeleteDocumentRequest(BaseModel): ids: List[str] - collection_name: str \ No newline at end of file + collection_name: str + +class ListDocumentsRequest(BaseModel): + collection_name: str + +class ListCollectionsRequest(BaseModel): + pass \ No newline at end of file diff --git a/src/api/routes/route_document.py b/src/api/routes/route_document.py index af5b205..74141ee 100644 --- a/src/api/routes/route_document.py +++ b/src/api/routes/route_document.py @@ -1,8 +1,8 @@ from fastapi import APIRouter, status, Request, Depends, HTTPException from src.api.models import APIResponse -from src.api.models.documents_request import AddDocumentRequest, QueryDocumentRequest, DeleteDocumentRequest +from src.api.models.documents_request import AddDocumentRequest, QueryDocumentRequest, ListDocumentsRequest, ListCollectionsRequest, DeleteDocumentRequest from typing import Optional, Dict -from src.api.controllers.controller_document import add_documents, query_documents, delete_documents +from src.api.controllers.controller_document import add_documents, query_documents, delete_documents, list_collections, list_documents router = APIRouter( prefix="/documents", @@ -15,7 +15,7 @@ async def add_document_route(doc_request: AddDocumentRequest, req: Request) -> A result = await add_documents( req.app.vectordb, doc_request.file_path, - doc_request.document_content, + doc_request.tags_documnets, doc_request.collection_name ) @@ -50,6 +50,37 @@ async def query_document_route(doc_request: QueryDocumentRequest, req: Request) status_message=f"Error detalhado: {str(e)}" ) +@router.get("/list_collections", status_code=status.HTTP_200_OK) +async def list_collections_route(req: Request): + try: + result = await list_collections(req.app.vectordb) + return { + "status_code": 200, + "response": result # Retorna diretamente a lista de nomes das coleções + } + except Exception as e: + return { + "status_code": 500, + "status_message": f"Erro detalhado: {str(e)}" + } + +@router.post("/list_documents", status_code=status.HTTP_200_OK) +async def list_documents_route(doc_request: ListDocumentsRequest, req: Request) -> APIResponse: + try: + result = await list_documents( + req.app.vectordb, + doc_request.collection_name + ) + return APIResponse( + status_code=200, + response=result + ) + except Exception as e: + return APIResponse( + status_code=500, + status_message=f"Error detalhado: {str(e)}" + ) + @router.delete("/delete_documents", status_code=status.HTTP_200_OK) async def delete_document_route(doc_request: DeleteDocumentRequest, req: Request) -> APIResponse: try: diff --git a/src/infrastructure/database/chromadb/conector.py b/src/infrastructure/database/chromadb/conector.py index 155ff43..1c808b9 100644 --- a/src/infrastructure/database/chromadb/conector.py +++ 
b/src/infrastructure/database/chromadb/conector.py @@ -87,6 +87,21 @@ async def query_documents( n_results=n_results, where=where ) + + async def list_collections(self): + """List all collections in ChromaDB. + + Returns: + dict: List of collections + """ + return self.client.list_collections() + + async def list_documents(self, collection_name: str): + """List all documents in a + collection. Args: collection_name (str): Name of the collection to list documents from Returns: dict: List of documents in the collection""" + if not self.collection: + self.collection = self._create_collection(collection_name) + return self.collection.get(include=["documents", "metadatas"]) async def delete_documents(self, ids: List[str],collection_name: str): """Delete documents from collection by IDs. @@ -104,7 +119,7 @@ async def delete_documents(self, ids: List[str],collection_name: str): self.collection = self._create_collection(collection_name) return self.collection.delete(ids=ids) - async def close(self): + def __del__(self): """Close ChromaDB connection and clean up resources.""" if self.client: self.client.reset() \ No newline at end of file diff --git a/src/main.py b/src/main.py index 4e2d5b7..764888d 100644 --- a/src/main.py +++ b/src/main.py @@ -1,5 +1,5 @@ from fastapi import FastAPI - +from src.infrastructure.database.chromadb.conector import ChromaDB from src.infrastructure.database.mongodb.connector import MongoDB from src.infrastructure.config.llm import LLM from src.services.llama_guard import LlamaGuard @@ -11,6 +11,7 @@ def create_app(): app = FastAPI() # defining API variables + app.vectordb = ChromaDB() app.database = MongoDB() app.llm = LLM() app.llama_guard = LlamaGuard() From 4887e850594745da7189fd1b8158cd5bc85f4244 Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Sun, 23 Feb 2025 14:01:06 -0300 Subject: [PATCH 13/15] [feat] update document ID generation method --- src/services/docmuent_extration/extractor.py | 28 +++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/services/docmuent_extration/extractor.py b/src/services/docmuent_extration/extractor.py index 7e99bdb..605f16a 100644 --- a/src/services/docmuent_extration/extractor.py +++ b/src/services/docmuent_extration/extractor.py @@ -46,31 +46,31 @@ class DocumentFileReader: - Arquivos TXT são tratados como um único documento """ - def __init__(self, file_path: str, document_content: str): + def __init__(self, file_path: str, tags_documnets: str, documnet_id: Optional[str] = None): """ Inicializa o leitor de documentos. Args: file_path (str): Caminho do arquivo a ser lido - document_content (str): Tipo de conteúdo do documento (ex: "noticia", "artigo") + tags_documnets (str): Tipo de conteúdo do documento (ex: "noticia", "artigo") Raises: FileNotFoundError: Se o arquivo não for encontrado ValueError: Se a extensão do arquivo não for suportada """ self.supported_extensions = {'.json', '.pdf', '.docx', '.txt'} - result = self.__call__(file_path, document_content) + result = self.__call__(file_path, tags_documnets, documnet_id) self.documents = result.documents self.ids = result.ids self.metadata = result.metadata - def __call__(self, file_path: str, document_content: str) -> DocumentResult: + def __call__(self, file_path: str, tags_documnets: str, documnet_id: Optional[str] = None) -> DocumentResult: """ Processa o arquivo e retorna os resultados estruturados. 
Args: file_path (str): Caminho do arquivo a ser lido - document_content (str): Tipo de conteúdo do documento + tags_documnets (str): Tipo de conteúdo do documento Returns: DocumentResult: Objeto contendo documentos, IDs e metadados @@ -87,7 +87,6 @@ def __call__(self, file_path: str, document_content: str) -> DocumentResult: if path.suffix.lower() not in self.supported_extensions: raise ValueError(f"Extensão não suportada: {path.suffix}") - # Resto do código permanece igual readers = { '.json': self._read_json, '.pdf': self._read_pdf, @@ -98,11 +97,20 @@ def __call__(self, file_path: str, document_content: str) -> DocumentResult: reader = readers.get(path.suffix.lower()) documents = reader(file_path) - ids = f"{document_content}" - metadata = [{'document_content': document_content} for _ in range(len(documents))] + # Gerando IDs únicos + if documnet_id is None: + ids = [f"{tags_documnets}_{i}" for i in range(len(documents))] + else: + ids = [documnet_id] + metadata = [{'id': doc_id, 'tags_documnets': tags_documnets} for doc_id in ids] - return DocumentResult(documents=documents, ids=ids, metadata=metadata) - + # Retornando o DocumentResult + return DocumentResult( + documents=documents, + ids=ids, + metadata=metadata + ) + def _read_json(self, file_path: str) -> List[str]: """ Lê e processa arquivos JSON. From 0628c8dec4c09f50594def2a4b5f7e7689036035 Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Mon, 24 Feb 2025 16:41:50 -0300 Subject: [PATCH 14/15] [bugfix] Fix list function behavior --- src/api/controllers/controller_document.py | 4 ++-- src/api/models/documents_request.py | 1 - src/api/routes/route_document.py | 1 - .../database/chromadb/conector.py | 18 +++++++++--------- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/api/controllers/controller_document.py b/src/api/controllers/controller_document.py index 63ccccd..baa7742 100644 --- a/src/api/controllers/controller_document.py +++ b/src/api/controllers/controller_document.py @@ -6,12 +6,12 @@ async def add_documents( db: ChromaDB, file_path: str, - tags_documnets: str, collection_name: str, + metadata: Optional[str] = None, documnet_id: Optional[List[str]] = None ) -> Dict: - reader = DocumentFileReader(file_path, tags_documnets, documnet_id) + reader = DocumentFileReader(file_path, metadata, documnet_id) result = await db.add_documents( documents=reader.documents, collection_name=collection_name, diff --git a/src/api/models/documents_request.py b/src/api/models/documents_request.py index 47f815b..c3af766 100644 --- a/src/api/models/documents_request.py +++ b/src/api/models/documents_request.py @@ -3,7 +3,6 @@ class AddDocumentRequest(BaseModel): file_path: str - tags_documnets: str collection_name: str class QueryDocumentRequest(BaseModel): diff --git a/src/api/routes/route_document.py b/src/api/routes/route_document.py index 74141ee..32321dd 100644 --- a/src/api/routes/route_document.py +++ b/src/api/routes/route_document.py @@ -15,7 +15,6 @@ async def add_document_route(doc_request: AddDocumentRequest, req: Request) -> A result = await add_documents( req.app.vectordb, doc_request.file_path, - doc_request.tags_documnets, doc_request.collection_name ) diff --git a/src/infrastructure/database/chromadb/conector.py b/src/infrastructure/database/chromadb/conector.py index 1c808b9..adc20a0 100644 --- a/src/infrastructure/database/chromadb/conector.py +++ b/src/infrastructure/database/chromadb/conector.py @@ -18,7 +18,7 @@ def _connect(self): client = chromadb.HttpClient(host=self.host, port=self.port) return 
client - def _create_collection(self, collection_name: str): + def _get_or_create_collection(self, collection_name: str): """Create or get an existing collection. Args: @@ -52,8 +52,8 @@ async def add_documents( Returns: dict: Result of add operation """ - if not self.collection: - self.collection = self._create_collection(collection_name) + + self.collection = self._get_or_create_collection(collection_name) return self.collection.add( documents=documents, metadatas=metadatas, @@ -80,8 +80,8 @@ async def query_documents( Returns: dict: Query results """ - if not self.collection: - self.collection = self._create_collection(collection_name) + + self.collection = self._get_or_create_collection(collection_name) return self.collection.query( query_texts=[query_text], n_results=n_results, @@ -99,8 +99,8 @@ async def list_collections(self): async def list_documents(self, collection_name: str): """List all documents in a collection. Args: collection_name (str): Name of the collection to list documents from Returns: dict: List of documents in the collection""" - if not self.collection: - self.collection = self._create_collection(collection_name) + + self.collection = self._get_or_create_collection(collection_name) return self.collection.get(include=["documents", "metadatas"]) async def delete_documents(self, ids: List[str],collection_name: str): @@ -115,8 +115,8 @@ async def delete_documents(self, ids: List[str],collection_name: str): Returns: dict: Result of delete operation """ - if not self.collection: - self.collection = self._create_collection(collection_name) + + self.collection = self._get_or_create_collection(collection_name) return self.collection.delete(ids=ids) def __del__(self): From 9bbe70801e5a12ab01c1f0325ba78b2ebe4592e2 Mon Sep 17 00:00:00 2001 From: Pedro HB Ribeiro Date: Mon, 24 Feb 2025 16:43:54 -0300 Subject: [PATCH 15/15] [refactor] Change ID generation logic --- src/services/docmuent_extration/extractor.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/services/docmuent_extration/extractor.py b/src/services/docmuent_extration/extractor.py index 605f16a..77ba17f 100644 --- a/src/services/docmuent_extration/extractor.py +++ b/src/services/docmuent_extration/extractor.py @@ -1,6 +1,7 @@ import json from typing import List, Dict, Tuple, Optional from pathlib import Path +import uuid from docx import Document from PyPDF2 import PdfReader import re @@ -46,31 +47,31 @@ class DocumentFileReader: - Arquivos TXT são tratados como um único documento """ - def __init__(self, file_path: str, tags_documnets: str, documnet_id: Optional[str] = None): + def __init__(self, file_path: str, metadata: str, documnet_id: Optional[str] = None): """ Inicializa o leitor de documentos. 
Args: file_path (str): Caminho do arquivo a ser lido - tags_documnets (str): Tipo de conteúdo do documento (ex: "noticia", "artigo") + metadata (str): Tipo de conteúdo do documento (ex: "noticia", "artigo") Raises: FileNotFoundError: Se o arquivo não for encontrado ValueError: Se a extensão do arquivo não for suportada """ self.supported_extensions = {'.json', '.pdf', '.docx', '.txt'} - result = self.__call__(file_path, tags_documnets, documnet_id) + result = self.__call__(file_path, metadata, documnet_id) self.documents = result.documents self.ids = result.ids self.metadata = result.metadata - def __call__(self, file_path: str, tags_documnets: str, documnet_id: Optional[str] = None) -> DocumentResult: + def __call__(self, file_path: str, metadata: Optional[str] = None, documnet_id: Optional[str] = None) -> DocumentResult: """ Processa o arquivo e retorna os resultados estruturados. Args: file_path (str): Caminho do arquivo a ser lido - tags_documnets (str): Tipo de conteúdo do documento + metadata (str): Tipo de conteúdo do documento Returns: DocumentResult: Objeto contendo documentos, IDs e metadados @@ -99,10 +100,9 @@ def __call__(self, file_path: str, tags_documnets: str, documnet_id: Optional[st # Gerando IDs únicos if documnet_id is None: - ids = [f"{tags_documnets}_{i}" for i in range(len(documents))] + ids=[str(uuid.uuid4()) for _ in enumerate(documents)] else: ids = [documnet_id] - metadata = [{'id': doc_id, 'tags_documnets': tags_documnets} for doc_id in ids] # Retornando o DocumentResult return DocumentResult(
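Taken together, these patches expose document ingestion, querying, and listing through the /documents API. Below is a minimal usage sketch, assuming the service is running locally on port 8001 (as configured in run.py), a Chroma server is reachable on localhost:8000, and the third-party requests package is installed; the file path and collection name are illustrative placeholders, not values taken from the patches.

import requests

BASE_URL = "http://localhost:8001/documents"

# Ingest a local PDF into a collection (AddDocumentRequest: file_path, collection_name).
add_resp = requests.post(
    f"{BASE_URL}/add_documents",
    json={"file_path": "data/example.pdf", "collection_name": "articles"},
)
print(add_resp.json())

# Search the collection for documents similar to a query (QueryDocumentRequest).
query_resp = requests.post(
    f"{BASE_URL}/query_documents",
    json={"query_text": "vector databases", "collection_name": "articles", "n_results": 3},
)
print(query_resp.json())

# Inspect what is stored: list all collections, then the documents in one of them.
print(requests.get(f"{BASE_URL}/list_collections").json())
print(requests.post(f"{BASE_URL}/list_documents", json={"collection_name": "articles"}).json())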