Skip to content

Commit 02e33f7

Browse files
authored
Feat/file parser debug (#572)
* feat: update file_content_parser fine * feat: add inner host * feat: add default inner reader ip * refactor: modify file_content_parser * feat: pass through parse when md/txt * feat: add text splitter and parser * feat: add default splitter
1 parent c590f3f commit 02e33f7

File tree

7 files changed

+592
-44
lines changed

7 files changed

+592
-44
lines changed

examples/mem_reader/multimodal_struct_reader.py

Lines changed: 105 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,102 @@ def get_info(self) -> dict[str, Any]:
327327
]
328328
],
329329
),
330+
TestCase(
331+
name="oss_text_file",
332+
description="User message with text and file",
333+
scene_data=[
334+
[
335+
{
336+
"role": "user",
337+
"content": [
338+
{"type": "text", "text": "请阅读这个PDF,总结里面的要点。"},
339+
{
340+
"type": "file",
341+
"file": {
342+
"file_id": "file_123",
343+
"filename": "report.pdf",
344+
"file_data": "@http://139.196.232.20:9090/graph-test/algorithm/2025_11_13/1763043889_1763043782_PM1%E8%BD%A6%E9%97%B4PMT%E9%9D%B4%E5%8E%8B%E8%BE%B9%E5%8E%8B%E5%8E%8B%E5%8A%9B%E6%97%A0%E6%B3%95%E5%BB%BA%E7%AB%8B%E6%95%85%E9%9A%9C%E6%8A%A5%E5%91%8A20240720.md",
345+
},
346+
},
347+
],
348+
"chat_time": "2025-11-24T10:21:00Z",
349+
"message_id": "mm-file-1",
350+
}
351+
]
352+
],
353+
),
354+
TestCase(
355+
name="pure_data_file",
356+
description="User message with text and file",
357+
scene_data=[
358+
[
359+
{
360+
"role": "user",
361+
"content": [
362+
{"type": "text", "text": "请阅读这个PDF,总结里面的要点。"},
363+
{
364+
"type": "file",
365+
"file": {
366+
"file_id": "file_123",
367+
"filename": "report.pdf",
368+
"file_data": "明文记忆是系统与用户对话、操作等交互中动态习得,以及外部提供的、可显式管理的结构化知识形态,通常以文档、提示模板、图结构或用户规则等形式存在。它具备编辑性、可共享性与治理友好性,适合存储需要频繁修改、可审计或多方协同使用的信息。 在 MemOS 中,明文记忆可用于动态生成推理上下文、个性化偏好注入、多代理协作共享等场景,成为连接人类输入与模型认知的关键桥梁。激活记忆是指模型在推理过程中产生的瞬时性认知状态,包括 KV cache、隐藏层激活、注意力权重等中间张量结构。它通常用于维持上下文连续性、对话一致性与行为风格控制。 MemOS 将激活记忆抽象为可调度资源,支持按需唤醒、延迟卸载与结构变换。例如,某些上下文状态可以被压缩为“半结构化记忆片段”用于未来复用,也可以在任务级别转化为参数化模块,支持短期记忆的长期化演进。这一机制为模型行为一致性、风格保持与状态持续性提供了基础。",
369+
},
370+
},
371+
],
372+
"chat_time": "2025-11-24T10:21:00Z",
373+
"message_id": "mm-file-1",
374+
}
375+
]
376+
],
377+
),
378+
TestCase(
379+
name="local_data_file",
380+
description="User message with text and file",
381+
scene_data=[
382+
[
383+
{
384+
"role": "user",
385+
"content": [
386+
{"type": "text", "text": "请阅读这个PDF,总结里面的要点。"},
387+
{
388+
"type": "file",
389+
"file": {
390+
"file_id": "file_123",
391+
"filename": "report.pdf",
392+
"file_data": "./my_local_file/report.pdf",
393+
},
394+
},
395+
],
396+
"chat_time": "2025-11-24T10:21:00Z",
397+
"message_id": "mm-file-1",
398+
}
399+
]
400+
],
401+
),
402+
TestCase(
403+
name="internet_file",
404+
description="User message with text and file",
405+
scene_data=[
406+
[
407+
{
408+
"role": "user",
409+
"content": [
410+
{"type": "text", "text": "请阅读这个PDF,总结里面的要点。"},
411+
{
412+
"type": "file",
413+
"file": {
414+
"file_id": "file_123",
415+
"filename": "report.pdf",
416+
"file_data": "https://upload.wikimedia.org/wikipedia/commons/c/cb/NLC416-16jh004830-88775_%E7%B4%85%E6%A8%93%E5%A4%A2.pdf",
417+
},
418+
},
419+
],
420+
"chat_time": "2025-11-24T10:21:00Z",
421+
"message_id": "mm-file-1",
422+
}
423+
]
424+
],
425+
),
330426
TestCase(
331427
name="multimodal_mixed",
332428
description="Mixed multimodal message (text + file + image)",
@@ -661,6 +757,12 @@ def get_reader_config() -> dict[str, Any]:
661757
},
662758
}
663759

760+
# Get direct markdown hostnames from environment variable
761+
direct_markdown_hostnames = None
762+
env_hostnames = os.getenv("FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES", "139.196.232.20")
763+
if env_hostnames:
764+
direct_markdown_hostnames = [h.strip() for h in env_hostnames.split(",") if h.strip()]
765+
664766
return {
665767
"llm": llm_config,
666768
"embedder": embedder_config,
@@ -673,6 +775,7 @@ def get_reader_config() -> dict[str, Any]:
673775
"min_sentences_per_chunk": 1,
674776
},
675777
},
778+
"direct_markdown_hostnames": direct_markdown_hostnames,
676779
}
677780

678781

@@ -863,13 +966,13 @@ def main():
863966
parser.add_argument(
864967
"--example",
865968
type=str,
866-
default="all",
969+
default="oss_text_file",
867970
help="Test case name, category name, or 'all' to run all cases (default: all)",
868971
)
869972
parser.add_argument(
870973
"--mode",
871974
choices=["fast", "fine"],
872-
default="fast",
975+
default="fine",
873976
help="Processing mode: fast (quick) or fine (with LLM) (default: fast)",
874977
)
875978
parser.add_argument(

src/memos/api/config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -707,6 +707,13 @@ def get_product_default_config() -> dict[str, Any]:
707707
},
708708
},
709709
"chat_chunker": reader_config,
710+
"direct_markdown_hostnames": [
711+
h.strip()
712+
for h in os.getenv(
713+
"FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES", "139.196.232.20"
714+
).split(",")
715+
if h.strip()
716+
],
710717
},
711718
},
712719
"enable_textual_memory": True,

src/memos/configs/mem_reader.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,12 @@ class SimpleStructMemReaderConfig(BaseMemReaderConfig):
4848
class MultiModalStructMemReaderConfig(BaseMemReaderConfig):
4949
"""MultiModalStruct MemReader configuration class."""
5050

51+
direct_markdown_hostnames: list[str] | None = Field(
52+
default=None,
53+
description="List of hostnames that should return markdown directly without parsing. "
54+
"If None, reads from FILE_PARSER_DIRECT_MARKDOWN_HOSTNAMES environment variable.",
55+
)
56+
5157

5258
class StrategyStructMemReaderConfig(BaseMemReaderConfig):
5359
"""StrategyStruct MemReader configuration class."""

src/memos/mem_reader/multi_modal_struct.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,13 @@ def __init__(self, config: MultiModalStructMemReaderConfig):
2929
"""
3030
from memos.configs.mem_reader import SimpleStructMemReaderConfig
3131

32+
# Extract direct_markdown_hostnames before converting to SimpleStructMemReaderConfig
33+
direct_markdown_hostnames = getattr(config, "direct_markdown_hostnames", None)
34+
35+
# Create config_dict excluding direct_markdown_hostnames for SimpleStructMemReaderConfig
3236
config_dict = config.model_dump(exclude_none=True)
37+
config_dict.pop("direct_markdown_hostnames", None)
38+
3339
simple_config = SimpleStructMemReaderConfig(**config_dict)
3440
super().__init__(simple_config)
3541

@@ -38,6 +44,7 @@ def __init__(self, config: MultiModalStructMemReaderConfig):
3844
embedder=self.embedder,
3945
llm=self.llm,
4046
parser=None,
47+
direct_markdown_hostnames=direct_markdown_hostnames,
4148
)
4249

4350
def _concat_multi_modal_memories(
@@ -271,7 +278,7 @@ def _process_multi_modal_data(
271278
sources = fast_item.metadata.sources
272279
for source in sources:
273280
items = self.multi_modal_parser.process_transfer(
274-
source, context_items=[fast_item], custom_tags=custom_tags
281+
source, context_items=[fast_item], custom_tags=custom_tags, info=info
275282
)
276283
fine_memory_items.extend(items)
277284
return fine_memory_items

0 commit comments

Comments
 (0)