-
Notifications
You must be signed in to change notification settings - Fork 1
/
page.py
34 lines (31 loc) · 1.31 KB
/
page.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from typing import List
from .document_chunking_base import DocumentChunkingBase
from langchain.text_splitter import MarkdownTextSplitter
from .chunking_strategy import ChunkingSettings
from ..common.source_document import SourceDocument
class PageDocumentChunking(DocumentChunkingBase):
    """Chunking strategy that splits every input document independently.

    Each document's content is split with a Markdown-aware, tiktoken-based
    text splitter, and every resulting piece becomes its own
    ``SourceDocument`` carrying the originating document's offset and page
    number.
    """

    def __init__(self) -> None:
        pass

    def chunk(
        self, documents: List[SourceDocument], chunking: ChunkingSettings
    ) -> List[SourceDocument]:
        """Split ``documents`` into smaller ``SourceDocument`` chunks.

        Args:
            documents: The documents to split. Presumably these are the
                pages of a single source file (verify against callers),
                because the ``source`` of the first document is used as the
                URL of every produced chunk.
            chunking: Settings providing ``chunk_size`` and
                ``chunk_overlap`` for the splitter.

        Returns:
            One ``SourceDocument`` per chunk, in input order. Chunks that
            come from the same input document share that document's index
            (``idx``), ``offset`` and ``page_number``. An empty input list
            yields an empty list.
        """
        # Guard against empty input: indexing documents[0] below would
        # otherwise raise IndexError.
        if not documents:
            return []

        # All chunks are attributed to the first document's source.
        document_url = documents[0].source
        splitter = MarkdownTextSplitter.from_tiktoken_encoder(
            chunk_size=chunking.chunk_size, chunk_overlap=chunking.chunk_overlap
        )

        documents_chunked: List[SourceDocument] = []
        for idx, document in enumerate(documents):
            # The metadata is identical for every chunk of this document.
            metadata = {
                "offset": document.offset,
                "page_number": document.page_number,
            }
            for chunked_content in splitter.split_text(document.content):
                documents_chunked.append(
                    SourceDocument.from_metadata(
                        content=chunked_content,
                        document_url=document_url,
                        # Copy so each chunk owns its own metadata dict,
                        # exactly as the original per-chunk literal did.
                        metadata=dict(metadata),
                        idx=idx,
                    )
                )
        return documents_chunked