@@ -27,22 +27,6 @@ class PageSummaryEnhancer(SummaryEnhancer):
2727 BASE64_IMAGE_KEY = "base64_image"
2828 DEFAULT_PAGE_NR = 1
2929
30- async def _acreate_summary (self , information : list [Document ], config : Optional [RunnableConfig ]) -> list [Document ]:
31- # group infos by page, defaulting to page 1 if no page metadata
32- if self ._chunker_settings :
33- filtered_information = [
34- info for info in information if len (info .page_content ) > self ._chunker_settings .max_size
35- ]
36- else :
37- filtered_information = information
38- grouped = [
39- [info for info in filtered_information if info .metadata .get ("page" , self .DEFAULT_PAGE_NR ) == page ]
40- for page in {info_piece .metadata .get ("page" , self .DEFAULT_PAGE_NR ) for info_piece in filtered_information }
41- ]
42-
43- summary_tasks = [self ._asummarize_page (info_group , config ) for info_group in tqdm (grouped )]
44- return await gather (* summary_tasks )
45-
4630 async def _asummarize_page (self , page_pieces : list [Document ], config : Optional [RunnableConfig ]) -> Document :
4731 full_page_content = " " .join ([piece .page_content for piece in page_pieces ])
4832 summary = await self ._summarizer .ainvoke (full_page_content , config )
@@ -52,3 +36,26 @@ async def _asummarize_page(self, page_pieces: list[Document], config: Optional[R
5236 meta ["type" ] = ContentType .SUMMARY .value
5337
5438 return Document (metadata = meta , page_content = summary )
39+
async def _acreate_summary(
    self, information: list[Document], config: Optional[RunnableConfig]
) -> list[Document]:
    """
    Create one summary document per page of the given information pieces.

    Pieces are grouped by the ``page`` entry of their metadata; pieces without
    that entry are assigned to ``DEFAULT_PAGE_NR``. Groups keep the order in
    which their page value first appears in ``information``, and pieces keep
    their input order inside each group.

    If ``self._chunker_settings`` is truthy, a page whose pieces, joined with a
    single space, are shorter than ``self._chunker_settings.max_size`` is
    skipped and produces no summary.

    Parameters
    ----------
    information : list[Document]
        The pieces of information to summarize.
    config : Optional[RunnableConfig]
        Configuration passed through unchanged to ``_asummarize_page``.

    Returns
    -------
    list[Document]
        The awaited results of ``_asummarize_page``, one per retained page
        group, in group order. Empty if no page is retained.
    """
    # Single pass: a dict keeps insertion order, so pages stay in first-seen
    # order without re-scanning ``information`` once per page.
    # NOTE(review): page values are used as dict keys and must be hashable.
    pieces_by_page: dict = {}
    for piece in information:
        page = piece.metadata.get("page", self.DEFAULT_PAGE_NR)
        pieces_by_page.setdefault(page, []).append(piece)

    settings = self._chunker_settings
    # Keep a page unless chunker settings exist and its joined content is
    # shorter than max_size. The join uses the same " " separator as
    # _asummarize_page, so the measured length matches what would be summarized.
    grouped = [
        page_pieces
        for page_pieces in pieces_by_page.values()
        if not settings or len(" ".join(item.page_content for item in page_pieces)) >= settings.max_size
    ]

    # tqdm wraps the iteration that builds the coroutines; they are only
    # awaited by gather() below.
    summary_tasks = [self._asummarize_page(info_group, config) for info_group in tqdm(grouped)]

    return await gather(*summary_tasks)
0 commit comments