feat: add doc cli tool and refine document processing (v1.0.9)

2026-05-17 11:53:26 +08:00 · 2026-05-17 11:53:26 +08:00 · 7d7dfd78e5
commit 7d7dfd78e5
parent 34e786b3d8
8 changed files with 670 additions and 39 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,3 +5,4 @@ env.json
 env.yml
 env.yaml
 .log.meta.json
+/test_res/
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,17 @@
 # CHANGELOG

+## v1.0.9 (2026-05-17)
+- **新特性**: 内置功能完整的命令行工具 `doc` (`cmd/doc`)。
+- **功能增强**: `doc` 支持创建、打开、查看预览（Markdown）、转换为 JSON、查看元数据（Inspect）以及数据注入（Excel）。
+- **文档优化**: README 增加 `doc` 命令行工具的安装与使用指南。
+
+## v1.0.8 (2026-05-15)
+- **基础设施对齐**: 同步更新 `go/cast` 和 `go/file` 至最新版本。
+- **功能修复**: 修复 Excel 单元格解析在某些边界情况下的偏差。
+
+## v1.0.7 (2026-05-14)
+- **依赖同步**: 对齐基础设施版本。
+
 ## v1.0.6 (2026-05-13)
 - **新特性**: 支持 `.csv` 格式，支持对象数组与 Markdown 表格转换。
 - **新特性**: 支持 `.md` 和 `.txt` 格式，统一纳入 `Document` 接口管理。
--- a/README.md
+++ b/README.md
@ -72,6 +72,37 @@ mdStr := g.ToMarkdown() // 包含 Mermaid graph TD 的渲染内容
 - `ToMarkdown() string`
 - `Save(filename ...string) error`

+## 命令行工具 (doc)
+
+`document` 包内置了一个强大的命令行工具 `doc`，位于 `cmd/doc` 目录下。
+
+### 安装
+使用 `go install` 安装，生成的二进制文件名即为 `doc`：
+```bash
+go install apigo.cc/go/document/cmd/doc@latest
+```
+
+### 常用命令
+```bash
+# 1. 预览 Excel/Word/PDF 内容 (默认输出 Markdown)
+doc report.xlsx
+
+# 2. 将文档转为结构化 JSON (适合 RAG 或自动化脚本)
+doc --json manual.docx
+
+# 3. 提取 PDF 内容并保存为 Markdown 文件
+doc -o paper.md paper.pdf
+
+# 4. 向已有的 Excel 注入数据 (支持追加或覆盖)
+doc --data '[{"id":1,"name":"Alice"}]' -o test.xlsx test.xlsx
+
+# 5. 查看文档元数据 (如工作表名、PDF 页数等)
+doc --inspect paper.pdf
+```
+
+### 帮助信息
+运行 `doc --help` 查看完整参数说明。
+
 ### Graph 专用 (关系型文档)
 - `AddNode(n *Node)`
 - `OpenGraph(filename string) (*Graph, error)`
--- a/cmd/doc/main.go
+++ b/cmd/doc/main.go
@ -0,0 +1,159 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"strings"
+
+	"apigo.cc/go/cast"
+	"apigo.cc/go/document"
+)
+
+// Command-line flags of the doc tool. The usage strings are user-facing and
+// are printed by flag.PrintDefaults from the custom flag.Usage in main.
+//
+// mdOut is registered so that -md is accepted on the command line, but no code
+// in this file reads it: outputContent prints Markdown whenever -json is unset.
+var (
+	jsonOut    = flag.Bool("json", false, "以 JSON 格式输出文档内容")
+	mdOut      = flag.Bool("md", false, "以 Markdown 格式输出文档内容 (默认模式)")
+	savePath   = flag.String("o", "", "保存结果到指定文件路径 (如: output.xlsx, content.md)")
+	createType = flag.String("create", "", "创建新文档，支持类型: xlsx, csv, graph, md")
+	password   = flag.String("password", "", "访问加密文档所需的密码 (主要针对 Excel)")
+	sheetName  = flag.String("sheet", "", "操作 Excel 时指定的工作表名称或索引 (0, 1...)")
+	dataStr    = flag.String("data", "", "注入数据的 JSON 字符串 (支持对象数组或单个对象)")
+	inspect    = flag.Bool("inspect", false, "只查看文档元数据 (如类型、页数、工作表列表等)")
+	version    = flag.Bool("v", false, "显示版本信息")
+)
+
+// docVersion is the version string reported by -v and shown in the usage banner.
+// NOTE(review): the changelog introduces this tool in v1.0.9 — confirm whether
+// the CLI is meant to carry its own, independent version number.
+const docVersion = "1.0.0"
+
+// main is the entry point of the doc CLI.
+//
+// It obtains a document (newly created via -create, or opened from the first
+// positional argument), optionally injects -data into it, and then either
+// prints metadata (-inspect), saves the document (-o) or prints its content
+// to stdout as Markdown or JSON.
+//
+// Flags may appear before or after the file name; see parseInterleaved.
+func main() {
+	flag.Usage = func() {
+		fmt.Fprintf(os.Stderr, "🗂️  Document CLI (doc) - 极简办公文档处理工具 v%s\n\n", docVersion)
+		fmt.Fprintf(os.Stderr, "用法:\n")
+		fmt.Fprintf(os.Stderr, "  doc [flags] [file]          # 处理已有文件\n")
+		fmt.Fprintf(os.Stderr, "  doc --create [type] [flags]  # 创建新文档\n\n")
+
+		fmt.Fprintf(os.Stderr, "常见示例:\n")
+		fmt.Fprintf(os.Stderr, "  doc report.xlsx             # 预览 Excel 内容 (Markdown 表格)\n")
+		fmt.Fprintf(os.Stderr, "  doc manual.docx --json      # 提取 Word 内容为结构化 JSON\n")
+		fmt.Fprintf(os.Stderr, "  doc paper.pdf -o text.md    # 提取 PDF 文字并存为 Markdown\n")
+		fmt.Fprintf(os.Stderr, "  doc --create xlsx -o n.xlsx # 创建空白 Excel\n")
+		fmt.Fprintf(os.Stderr, "  doc test.xlsx --data '[{\"ID\":1}]' -o test.xlsx  # 向 Excel 追加数据\n\n")
+
+		fmt.Fprintf(os.Stderr, "参数详解:\n")
+		flag.PrintDefaults()
+
+		fmt.Fprintf(os.Stderr, "\n支持的格式:\n")
+		fmt.Fprintf(os.Stderr, "  Excel (.xlsx), Word (.docx), PDF (.pdf), PPT (.pptx), CSV (.csv), Graph (.graph), Markdown (.md)\n")
+	}
+
+	// A plain flag.Parse stops at the first non-flag argument, which would
+	// silently drop the flags in "doc manual.docx --json".
+	args := parseInterleaved(os.Args[1:])
+
+	if *version {
+		fmt.Printf("doc version %s\n", docVersion)
+		return
+	}
+
+	var doc document.Document
+	var err error
+
+	// 1. Obtain the document instance.
+	switch {
+	case *createType != "":
+		doc, err = document.Create(*createType)
+		if err != nil {
+			fail("创建文档失败: %v", err)
+		}
+	case len(args) > 0:
+		filename := args[0]
+		if *password != "" {
+			doc, err = document.Open(filename, *password)
+		} else {
+			doc, err = document.Open(filename)
+		}
+		if err != nil {
+			fail("无法打开文件 '%s': %v", filename, err)
+		}
+	default:
+		flag.Usage()
+		return
+	}
+
+	// 2. Inject data, if requested.
+	if *dataStr != "" {
+		applyData(doc, *dataStr, *sheetName)
+	}
+
+	// 3. Run the requested action.
+	if *inspect {
+		runInspect(doc)
+		return
+	}
+
+	if *savePath != "" {
+		if err := doc.Save(*savePath); err != nil {
+			fail("保存失败: %v", err)
+		}
+		fmt.Printf("✨ 成功保存至: %s\n", *savePath)
+		return
+	}
+	outputContent(doc, *jsonOut)
+}
+
+// parseInterleaved parses args against the default flag set while allowing
+// flags to appear after positional arguments, and returns the positional
+// arguments in order. Everything following an explicit "--" terminator is
+// treated as positional.
+func parseInterleaved(args []string) []string {
+	var positional []string
+	for {
+		// flag.CommandLine is created with ExitOnError, so Parse terminates
+		// the process itself on malformed input; the error adds nothing here.
+		_ = flag.CommandLine.Parse(args)
+		rest := flag.Args()
+		if len(rest) == 0 {
+			return positional
+		}
+		if consumed := len(args) - len(rest); consumed > 0 && args[consumed-1] == "--" {
+			return append(positional, rest...)
+		}
+		// Keep the first non-flag argument and resume flag parsing after it.
+		positional = append(positional, rest[0])
+		args = rest[1:]
+	}
+}
+
+// applyData decodes dataStr as JSON and writes it into doc.
+//
+// dataStr may be an array of objects or a single object; a single object is
+// wrapped into a one-element array. For an Excel document the rows are written
+// with SetData starting at cell "A1" of the given sheet. Other document types
+// are left untouched and a notice is printed to stderr, so that the content
+// written to stdout (Markdown or JSON) stays clean for piping.
+//
+// Invalid JSON or a failed Excel write terminates the process via fail.
+func applyData(doc document.Document, dataStr, sheet string) {
+	var data []map[string]any
+	if arrErr := cast.UnmarshalJSON(dataStr, &data); arrErr != nil {
+		// Not an array of objects: accept a single object as one row.
+		var single map[string]any
+		if objErr := cast.UnmarshalJSON(dataStr, &single); objErr != nil {
+			fail("数据格式无效，请提供有效的 JSON 对象或数组: %v", arrErr)
+		}
+		data = []map[string]any{single}
+	}
+
+	switch d := doc.(type) {
+	case *document.Excel:
+		if err := d.SetData(sheet, data, "A1", ""); err != nil {
+			fail("写入 Excel 失败: %v", err)
+		}
+	case *document.Graph:
+		fmt.Fprintln(os.Stderr, "⚠️  提示: Graph 类型目前主要通过 API 操作，暂不支持通过 CLI 批量 SetData。")
+	default:
+		fmt.Fprintf(os.Stderr, "⚠️  警告: 当前文档类型 (%T) 不支持数据注入操作。\n", d)
+	}
+}
+
+// runInspect prints a short metadata summary of doc to stdout: its Go type,
+// the sheet names for an Excel document, and the Metadata entries for a PDF
+// (the "pages" entry first, when present).
+//
+// The remaining PDF entries are printed in Go map iteration order, which is
+// not stable between runs.
+func runInspect(doc document.Document) {
+	fmt.Printf("🔍 文档详情:\n")
+	fmt.Printf("  类型: %T\n", doc)
+
+	if xls, isExcel := doc.(*document.Excel); isExcel {
+		fmt.Printf("  工作表: %s\n", strings.Join(xls.Sheets(), ", "))
+		return
+	}
+
+	pdfDoc, isPDF := doc.(*document.PDF)
+	if !isPDF {
+		return
+	}
+	if pages, ok := pdfDoc.Metadata["pages"]; ok {
+		fmt.Printf("  总页数: %v\n", pages)
+	}
+	for key, value := range pdfDoc.Metadata {
+		if key == "pages" {
+			continue
+		}
+		fmt.Printf("  %s: %v\n", key, value)
+	}
+}
+
+// outputContent prints the document to stdout: its ToJSON form when asJSON is
+// set, otherwise its ToMarkdown form, with a placeholder line when the
+// Markdown is empty.
+func outputContent(doc document.Document, asJSON bool) {
+	if asJSON {
+		fmt.Println(doc.ToJSON())
+		return
+	}
+
+	text := doc.ToMarkdown()
+	if text == "" {
+		text = "(文档内容为空)"
+	}
+	fmt.Println(text)
+}
+
+// fail writes a formatted error message, prefixed with the error marker, to
+// stderr and exits the process with status 1. It never returns.
+func fail(format string, a ...any) {
+	layout := "❌ 错误: " + format + "\n"
+	fmt.Fprintf(os.Stderr, layout, a...)
+	os.Exit(1)
+}
--- a/docx.go
+++ b/docx.go
@ -45,7 +45,7 @@ func OpenDocx(filename string) (*Docx, error) {
 	return d, nil
 }

-// extractMarkdown 尝试从 docx 的 XML 中提取带标题的 Markdown。
+// extractMarkdown 尝试从 docx 的 XML 中提取带标题和表格的 Markdown。
 func (d *Docx) extractMarkdown(filename string) (string, error) {
 	r, err := zip.OpenReader(filename)
 	if err != nil {
@ -69,6 +69,10 @@ func (d *Docx) extractMarkdown(filename string) (string, error) {
 	var sb strings.Builder
 	var inT bool
 	var currentStyle string
+	var inTable bool
+	var tableRows [][]string
+	var currentRow []string
+	var cellText strings.Builder

 	for {
 		t, err := decoder.Token()
@ -81,29 +85,56 @@ func (d *Docx) extractMarkdown(filename string) (string, error) {

 		switch se := t.(type) {
 		case xml.StartElement:
-			if se.Name.Local == "p" {
+			switch se.Name.Local {
+			case "p":
 				currentStyle = ""
-			} else if se.Name.Local == "pStyle" {
+				cellText.Reset()
+			case "pStyle":
 				for _, attr := range se.Attr {
 					if attr.Name.Local == "val" {
 						currentStyle = attr.Value
 					}
 				}
-			} else if se.Name.Local == "t" {
+			case "t":
 				inT = true
+			case "tbl":
+				inTable = true
+				tableRows = nil
+			case "tr":
+				currentRow = nil
+			case "tc":
+				cellText.Reset()
 			}
 		case xml.EndElement:
-			if se.Name.Local == "p" {
-				sb.WriteString("\n")
-			} else if se.Name.Local == "t" {
+			switch se.Name.Local {
+			case "p":
+				if inTable {
+					// Paragraph inside table cell is handled by cellEnd
+				} else {
+					sb.WriteString("\n")
+				}
+			case "t":
 				inT = false
+			case "tc":
+				currentRow = append(currentRow, strings.TrimSpace(cellText.String()))
+				cellText.Reset()
+			case "tr":
+				tableRows = append(tableRows, currentRow)
+			case "tbl":
+				inTable = false
+				sb.WriteString(renderMarkdownTable(tableRows))
+				sb.WriteString("\n")
 			}
 		case xml.CharData:
-			if inT {
-				text := string(se)
-				if strings.Contains(strings.ToLower(currentStyle), "heading") {
+			text := string(se)
+			if inTable {
+				cellText.WriteString(text)
+			} else if inT {
+				if strings.Contains(strings.ToLower(currentStyle), "heading") || 
+				   strings.Contains(strings.ToLower(currentStyle), "title") ||
+				   strings.Contains(strings.ToLower(currentStyle), "subject") {
 					level := "1"
-					if len(currentStyle) > 7 {
+					if strings.Contains(strings.ToLower(currentStyle), "heading") && len(currentStyle) > 7 {
 						level = currentStyle[7:]
 					}
 					l := cast.To[int](level)
@ -122,6 +153,30 @@ func (d *Docx) extractMarkdown(filename string) (string, error) {
 	return strings.TrimSpace(sb.String()), nil
 }

+// renderMarkdownTable renders rows as a Markdown pipe table, using the first
+// row as the header. The result starts with a blank line and ends with a
+// newline; it is empty when rows contains no cells at all.
+//
+// Every row is padded with empty cells to the width of the widest row so the
+// table stays rectangular. Inside a cell, "|" is escaped and any run of
+// whitespace (including newlines) is collapsed to a single space, because a
+// line break would end the table row.
+func renderMarkdownTable(rows [][]string) string {
+	width := 0
+	for _, row := range rows {
+		if len(row) > width {
+			width = len(row)
+		}
+	}
+	if width == 0 {
+		return ""
+	}
+
+	var sb strings.Builder
+	sb.WriteString("\n")
+	for i, row := range rows {
+		sb.WriteString("|")
+		for c := 0; c < width; c++ {
+			cell := ""
+			if c < len(row) {
+				cell = strings.Join(strings.Fields(row[c]), " ")
+				cell = strings.ReplaceAll(cell, "|", "\\|")
+			}
+			sb.WriteString(" " + cell + " |")
+		}
+		sb.WriteString("\n")
+		if i == 0 {
+			// Delimiter row separating the header from the body.
+			sb.WriteString("|" + strings.Repeat(" --- |", width) + "\n")
+		}
+	}
+	return sb.String()
+}
+
 // ToJSON 返回包含元数据和内容的 JSON 字符串。
 func (d *Docx) ToJSON() string {
 	res, _ := cast.ToJSON(map[string]any{
--- a/excel.go
+++ b/excel.go
@ -87,7 +87,7 @@ func (xls *Excel) ToMarkdown() string {
 		}

 		if len(sheets) > 1 {
-			sb.WriteString("## Sheet: " + sheetName + "\n\n")
+			sb.WriteString("# Sheet: " + sheetName + "\n\n")
 		}

 		for i, row := range rows {
--- a/pdf.go
+++ b/pdf.go
@ -2,6 +2,7 @@ package document

 import (
 	"fmt"
+	"sort"
 	"strings"

 	"apigo.cc/go/cast"
@ -25,42 +26,371 @@ func OpenPDF(filename string) (*PDF, error) {
 		filename: filename,
 		Metadata: make(map[string]any),
 	}
-	
+
 	f, err := pdf.Open(filename)
-	if err == nil {
-		p.Metadata["pages"] = f.NumPage()
-		trailer := f.Trailer()
-		infoDict := trailer.Key("Info")
-		if !infoDict.IsNull() {
-			for _, field := range infoDict.Keys() {
-				val := infoDict.Key(field).Text()
-				if val != "" {
-					p.Metadata[strings.ToLower(field)] = val
-				}
+	if err != nil {
+		return nil, err
+	}
+
+	p.Metadata["pages"] = f.NumPage()
+	trailer := f.Trailer()
+	infoDict := trailer.Key("Info")
+	if !infoDict.IsNull() {
+		for _, field := range infoDict.Keys() {
+			val := infoDict.Key(field).Text()
+			if val != "" {
+				p.Metadata[strings.ToLower(field)] = val
 			}
 		}
-		
+	}
+
+	var sb strings.Builder
+	for i := 1; i <= f.NumPage(); i++ {
+		page := f.Page(i)
+		if page.V.IsNull() {
+			continue
+		}
+
+		content := page.Content()
+		texts := content.Text
+		if len(texts) == 0 {
+			continue
+		}
+
+		// 处理页面内容
+		sb.WriteString(p.processPageTexts(texts))
+	}
+	p.Content = strings.TrimSpace(sb.String())
+
+	return p, nil
+}
+
+func (p *PDF) processPageTexts(texts []pdf.Text) string {
+	if len(texts) == 0 {
+		return ""
+	}
+
+	// 1. 估算正文字体大小（众数）
+	fontSizes := make(map[int]int)
+	for _, t := range texts {
+		fontSizes[int(t.FontSize)]++
+	}
+	bodySize := 0
+	maxCount := 0
+	for size, count := range fontSizes {
+		if count > maxCount {
+			maxCount = count
+			bodySize = size
+		}
+	}
+
+	// 2. 按行分组（基于 Y 坐标）
+	type Line struct {
+		Y           float64
+		MaxFontSize float64
+		Text        string
+		Texts       []pdf.Text
+	}
+	var lines []Line
+	for _, t := range texts {
+		found := false
+		for i := range lines {
+			if t.Y > lines[i].Y-2 && t.Y < lines[i].Y+2 {
+				lines[i].Texts = append(lines[i].Texts, t)
+				found = true
+				break
+			}
+		}
+		if !found {
+			lines = append(lines, Line{Y: t.Y, Texts: []pdf.Text{t}})
+		}
+	}
+
+	// 按 Y 降序排列（从上到下）
+	sort.Slice(lines, func(i, j int) bool {
+		return lines[i].Y > lines[j].Y
+	})
+
+	// 预处理每一行的文本和最大字体
+	for i := range lines {
+		sort.Slice(lines[i].Texts, func(m, n int) bool {
+			return lines[i].Texts[m].X < lines[i].Texts[n].X
+		})
 		var sb strings.Builder
-		for i := 1; i <= f.NumPage(); i++ {
-			p_ := f.Page(i)
-			if p_.V.IsNull() {
+		maxFS := 0.0
+		for _, t := range lines[i].Texts {
+			sb.WriteString(t.S)
+			if t.FontSize > maxFS {
+				maxFS = t.FontSize
+			}
+		}
+		lines[i].Text = strings.TrimSpace(sb.String())
+		lines[i].MaxFontSize = maxFS
+	}
+
+	// 3. 语义块识别
+	type Block struct {
+		Type     string // heading, paragraph, table
+		Level    int    // for heading
+		Text     string
+		FontSize float64
+	}
+	var blocks []Block
+	for i := 0; i < len(lines); i++ {
+		line := lines[i]
+		if line.Text == "" {
+			continue
+		}
+
+		// 表格识别逻辑
+		isTableLine, cells := p.identifyTableLine(line)
+		if isTableLine {
+			tableStr := "| " + strings.Join(cells, " | ") + " |"
+			blocks = append(blocks, Block{Type: "table", Text: tableStr})
+			continue
+		}
+
+		// 标题识别逻辑 (比正文大)
+		if line.MaxFontSize > float64(bodySize)+1 {
+			level := 1
+			if line.MaxFontSize < float64(bodySize)+4 {
+				level = 3
+			} else if line.MaxFontSize < float64(bodySize)+8 {
+				level = 2
+			}
+
+			// 合并紧随其后的同字体行（处理跨行标题）
+			fullText := line.Text
+			lastY := line.Y
+			for j := i + 1; j < len(lines); j++ {
+				if lines[j].Text != "" && MathAbs(lines[j].MaxFontSize-line.MaxFontSize) < 1.0 {
+					if MathAbs(lines[j].Y-lastY) < 25 {
+						fullText += " " + lines[j].Text
+						lastY = lines[j].Y
+						i = j
+					} else {
+						break
+					}
+				} else {
+					break
+				}
+			}
+
+			// 检查是否重复
+			if i < 5 {
+				if title, ok := p.Metadata["title"].(string); ok && strings.Contains(title, fullText) {
+					continue
+				}
+			}
+
+			// 合并同级标题
+			if len(blocks) > 0 && blocks[len(blocks)-1].Type == "heading" && blocks[len(blocks)-1].Level == level {
+				blocks[len(blocks)-1].Text += " " + fullText
+			} else {
+				blocks = append(blocks, Block{Type: "heading", Level: level, Text: fullText})
+			}
+		} else {
+			// 跳过页码
+			if (line.Y < 50 || line.Y > 800) && isPageNumber(line.Text) {
 				continue
 			}
-			t := p_.Content().Text
-			if len(t) > 0 {
-				if i > 1 {
-					sb.WriteString("\n\n")
-				}
-				sb.WriteString(fmt.Sprintf("<!-- Page %d -->\n", i))
-				for _, text := range t {
-					sb.WriteString(text.S)
+
+			// 段落识别
+			fullText := line.Text
+			lastY := line.Y
+			for j := i + 1; j < len(lines); j++ {
+				if lines[j].Text != "" && lines[j].MaxFontSize <= float64(bodySize)+1 {
+					isT, _ := p.identifyTableLine(lines[j])
+					if isT {
+						break
+					}
+					if (lines[j].Y < 50 || lines[j].Y > 800) && isPageNumber(lines[j].Text) {
+						break
+					}
+					if MathAbs(lines[j].Y-lastY) > 25 {
+						break
+					}
+
+					fullText += lines[j].Text
+					lastY = lines[j].Y
+					i = j
+
+					if isPunctuation(lines[j].Text) {
+						break
+					}
+				} else {
+					break
 				}
 			}
+			blocks = append(blocks, Block{Type: "paragraph", Text: fullText})
 		}
-		p.Content = strings.TrimSpace(sb.String())
 	}
+
+	// 4. 智能封面/标题逻辑
+	// 策略：
+	// 1. 第一个 Heading 始终是文档标题 (#)
+	// 2. 在遇到第一个明确的“章节标题”或“正文段落”之前，中间的 Heading 如果很长，则转为正文文本。
+	// 3. 统计 H1，如果只有一个 H1，则尝试将 H2 提升为 H1。
 	
-	return p, nil
+	h1Count := 0
+	for _, b := range blocks {
+		if b.Type == "heading" && b.Level == 1 {
+			h1Count++
+		}
+	}
+	shouldPromote := h1Count <= 1
+
+	firstHeadingFound := false
+	contentStarted := false
+
+	var res strings.Builder
+	for _, b := range blocks {
+		if b.Type == "heading" {
+			level := b.Level
+			
+			if !contentStarted {
+				if !firstHeadingFound {
+					// 文档总标题
+					res.WriteString("\n# " + b.Text + "\n\n")
+					firstHeadingFound = true
+					continue
+				} else {
+					// 封面期间的其他标题
+					// 如果是已知的章节名，则认为内容开始了
+					if isStandardSection(b.Text) {
+						contentStarted = true
+						// 章节名也应该是 #
+						res.WriteString("\n# " + b.Text + "\n\n")
+						continue
+					}
+					// 否则作为封面副标题/文本
+					res.WriteString(b.Text + "\n\n")
+					continue
+				}
+			}
+
+			// 内容已经开始
+			if shouldPromote {
+				if level == 2 {
+					level = 1
+				} else if level == 3 {
+					level = 2
+				}
+			}
+			res.WriteString("\n" + strings.Repeat("#", level) + " " + b.Text + "\n\n")
+		} else if b.Type == "paragraph" {
+			contentStarted = true // 遇到第一个正文段落，标志着封面/标题期结束
+			res.WriteString(b.Text + "\n\n")
+		} else if b.Type == "table" {
+			contentStarted = true
+			res.WriteString(b.Text + "\n")
+		}
+	}
+
+	return res.String()
+}
+
+// isStandardSection reports whether s looks like the title of a regular
+// document section rather than cover-page text.
+//
+// It returns true when s contains one of a fixed list of common section names
+// (Chinese or English), or when s is shorter than 20 characters and either
+// starts with an ASCII digit or has the form "第…章" / "第…节" / "第…部分".
+// Empty or whitespace-only input yields false.
+func isStandardSection(s string) bool {
+	s = strings.TrimSpace(s)
+	if s == "" {
+		// Guard the s[0] access below.
+		return false
+	}
+
+	// Keywords that commonly open a section.
+	standards := []string{"引言", "摘要", "目录", "前言", "结论", "背景", "概述", "Introduction", "Abstract", "Conclusion", "Summary"}
+	for _, std := range standards {
+		if strings.Contains(s, std) {
+			return true
+		}
+	}
+
+	// A short, numbered title is also taken as the start of a section.
+	if len([]rune(s)) < 20 {
+		if s[0] >= '0' && s[0] <= '9' {
+			return true
+		}
+		if strings.HasPrefix(s, "第") && (strings.Contains(s, "章") || strings.Contains(s, "节") || strings.Contains(s, "部分")) {
+			return true
+		}
+	}
+	return false
+}
+
+// isPageNumber reports whether s, after trimming surrounding whitespace, looks
+// like a page-number line: only ASCII digits, text starting with "page"
+// (case-insensitive), or text that both starts and ends with "-".
+// Empty input yields false.
+func isPageNumber(s string) bool {
+	s = strings.TrimSpace(s)
+	switch {
+	case s == "":
+		return false
+	case strings.Trim(s, "0123456789") == "":
+		// Nothing is left once digits are stripped, so s is purely numeric.
+		return true
+	case strings.HasPrefix(strings.ToLower(s), "page"):
+		return true
+	default:
+		return strings.HasPrefix(s, "-") && strings.HasSuffix(s, "-")
+	}
+}
+
+// isPunctuation reports whether the last character of s is one of the
+// sentence-ending marks "。.!！?？:：". Empty input yields false.
+func isPunctuation(s string) bool {
+	if s == "" {
+		return false
+	}
+	var last rune
+	for _, r := range s {
+		last = r
+	}
+	return strings.ContainsRune("。.!！?？:：", last)
+}
+
+// identifyTableLine decides whether a line of PDF text fragments looks like a
+// table row and, if so, returns its cells.
+//
+// Fragments are concatenated in the order given; a new cell starts whenever
+// the X position of a fragment lies more than 40 beyond the end (X + W) of the
+// previous one. Cells that are blank after trimming are dropped. The line is
+// accepted only if it has at least 3 fragments, yields at least 2 cells, no
+// cell is longer than 40 characters, and at least one cell has more than one
+// character.
+func (p *PDF) identifyTableLine(line struct {
+	Y           float64
+	MaxFontSize float64
+	Text        string
+	Texts       []pdf.Text
+}) (bool, []string) {
+	if len(line.Texts) < 3 {
+		return false, nil
+	}
+
+	var (
+		cells   []string
+		pending strings.Builder
+	)
+	// flush stores the pending cell text unless it is blank.
+	flush := func() {
+		if cell := strings.TrimSpace(pending.String()); cell != "" {
+			cells = append(cells, cell)
+		}
+	}
+
+	prevEnd := -1.0 // -1 marks "no fragment seen yet"
+	for _, frag := range line.Texts {
+		if prevEnd != -1.0 && frag.X-prevEnd > 40 {
+			flush()
+			pending.Reset()
+		}
+		pending.WriteString(frag.S)
+		prevEnd = frag.X + frag.W
+	}
+	flush()
+
+	if len(cells) < 2 {
+		return false, nil
+	}
+
+	hasMultiChar := false
+	for _, cell := range cells {
+		n := len([]rune(cell))
+		if n > 40 {
+			return false, nil
+		}
+		if n > 1 {
+			hasMultiChar = true
+		}
+	}
+	if !hasMultiChar {
+		return false, nil
+	}
+	return true, cells
+}
+
+// MathAbs returns the absolute value of v.
+func MathAbs(v float64) float64 {
+	if v >= 0 {
+		return v
+	}
+	return -v
 }

 // ToJSON 返回结构化 JSON。
@ -74,7 +404,15 @@ func (p *PDF) ToJSON() string {

 // ToMarkdown 返回 Markdown。
 func (p *PDF) ToMarkdown() string {
-	return p.Content
+	if p.Content == "" {
+		return ""
+	}
+	var sb strings.Builder
+	if title, ok := p.Metadata["title"]; ok && title != "" {
+		sb.WriteString("# " + cast.To[string](title) + "\n\n")
+	}
+	sb.WriteString(p.Content)
+	return sb.String()
 }

 // Save 保存（目前保存为提取后的文本）。
--- a/pptx.go
+++ b/pptx.go
@ -3,6 +3,7 @@ package document
 import (
 	"fmt"
 	"io"
+	"strings"

 	"apigo.cc/go/cast"
 	"apigo.cc/go/file"
@ -46,7 +47,41 @@ func (p *Pptx) ToJSON() string {

 // ToMarkdown 返回 Markdown。
 func (p *Pptx) ToMarkdown() string {
-	return p.Content
+	if p.Content == "" {
+		return ""
+	}
+	
+	lines := strings.Split(p.Content, "\n")
+	var res []string
+	nextIsMainTitle := true
+	
+	for _, line := range lines {
+		trimmed := strings.TrimSpace(line)
+		if trimmed == "" {
+			continue
+		}
+		
+		// 检查幻灯片分隔符
+		if strings.Contains(line, "--------------------------------") {
+			res = append(res, "\n---")
+			nextIsMainTitle = true
+			continue
+		}
+		
+		// 启发式识别标题：行短、无句末标点
+		if len([]rune(trimmed)) < 50 && !strings.HasSuffix(trimmed, ".") && !strings.HasSuffix(trimmed, "。") && !strings.HasSuffix(trimmed, ":") && !strings.HasSuffix(trimmed, "：") {
+			if nextIsMainTitle {
+				res = append(res, "\n# "+trimmed)
+				nextIsMainTitle = false
+			} else {
+				res = append(res, "\n## "+trimmed)
+			}
+		} else {
+			res = append(res, trimmed)
+			// 如果该页已经有了主标题，后续的长文本不会重置标题状态
+		}
+	}
+	return strings.TrimSpace(strings.Join(res, "\n"))
 }

 // Save 保存文档（目前保存为提取后的文本）。