chore: infrastructure alignment and doc sync (by codervall)

2026-05-18 19:51:35 +08:00 · 2026-05-18 19:51:35 +08:00 · b71cb7f48f
commit b71cb7f48f
parent 7d7dfd78e5
2 changed files with 175 additions and 178 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,18 @@
 # CHANGELOG
 ## v1.1.0 (2026-05-17)
 - **PDF 语义重构**: 引入全局语义分析引擎。
    - **无缝流**: 彻底移除分页干扰（移除 `---` 和 `Page X` 标记），实现跨页内容自然合并。
    - **智能标题层级**: 自动锁定文档总标题，智能识别并合并跨行长标题，避免封面期标题碎片化。
    - **自动平衡**: 根据文档内容自动提升章节标题层级，确保 Markdown 目录结构平衡。
    - **鲁棒表格识别**: 调优列间距算法并增加长句过滤，大幅降低 PDF 误判表格的概率。
 - **转换能力全面对齐**:
    - **PPTX**: 每一页幻灯片标题现在统一识别为 `#` 一级标题，优化页面视觉分层。
    - **DOCX**: 重构 XML 解析，完美支持 Word 原生表格 (`w:tbl`) 转换为 Markdown 表格。
    - **XLSX**: 提升 Sheet 名称为 `#` 标题。
 - **工程化增强**: 新增 `test_res/testmd.sh` 自动化验证脚本，覆盖 4 种主流办公格式的 Markdown 转换质量。
 - **接口一致性**: 强制所有格式 Markdown 输出从一级标题 (`#`) 开始。
 ## v1.0.9 (2026-05-17)
 - **新特性**: 内置功能完整的命令行工具 `doc` (`cmd/doc`)。
 - **功能增强**: `doc` 支持创建、打开、查看预览（Markdown）、转换为 JSON、查看元数据（Inspect）以及数据注入（Excel）。
--- a/pdf.go
+++ b/pdf.go
@@ -44,7 +44,15 @@ func OpenPDF(filename string) (*PDF, error) {
 		}
 	}
-	var sb strings.Builder
+	// 收集所有页面的 block
 	type Block struct {
 		Type     string
 		Level    int
 		Text     string
 		FontSize float64
 	}
 	var allBlocks []Block
 	for i := 1; i <= f.NumPage(); i++ {
 		page := f.Page(i)
 		if page.V.IsNull() {
@@ -57,219 +65,194 @@ func OpenPDF(filename string) (*PDF, error) {
 			continue
 		}
-		// 处理页面内容
+		// 1. 估算正文字体大小（众数）
-		sb.WriteString(p.processPageTexts(texts))
+		fontSizes := make(map[int]int)
-	}
+		for _, t := range texts {
-	p.Content = strings.TrimSpace(sb.String())
+			fontSizes[int(t.FontSize)]++
 	return p, nil
 }
 func (p *PDF) processPageTexts(texts []pdf.Text) string {
 	if len(texts) == 0 {
 		return ""
 	}
 	// 1. 估算正文字体大小（众数）
 	fontSizes := make(map[int]int)
 	for _, t := range texts {
 		fontSizes[int(t.FontSize)]++
 	}
 	bodySize := 0
 	maxCount := 0
 	for size, count := range fontSizes {
 		if count > maxCount {
 			maxCount = count
 			bodySize = size
 		}
-	}
+		bodySize := 0
-
+		maxCount := 0
-	// 2. 按行分组（基于 Y 坐标）
+		for size, count := range fontSizes {
-	type Line struct {
+			if count > maxCount {
-		Y           float64
+				maxCount = count
-		MaxFontSize float64
+				bodySize = size
 		Text        string
 		Texts       []pdf.Text
 	}
 	var lines []Line
 	for _, t := range texts {
 		found := false
 		for i := range lines {
 			if t.Y > lines[i].Y-2 && t.Y < lines[i].Y+2 {
 				lines[i].Texts = append(lines[i].Texts, t)
 				found = true
 				break
 			}
 		}
-		if !found {
+
-			lines = append(lines, Line{Y: t.Y, Texts: []pdf.Text{t}})
+		// 2. 按行分组（基于 Y 坐标）
 		type Line struct {
 			Y           float64
 			MaxFontSize float64
 			Text        string
 			Texts       []pdf.Text
 		}
-	}
+		var lines []Line
-
+		for _, t := range texts {
-	// 按 Y 降序排列（从上到下）
+			found := false
-	sort.Slice(lines, func(i, j int) bool {
+			for i := range lines {
-		return lines[i].Y > lines[j].Y
+				if t.Y > lines[i].Y-2 && t.Y < lines[i].Y+2 {
-	})
+					lines[i].Texts = append(lines[i].Texts, t)
-
+					found = true
 	// 预处理每一行的文本和最大字体
 	for i := range lines {
 		sort.Slice(lines[i].Texts, func(m, n int) bool {
 			return lines[i].Texts[m].X < lines[i].Texts[n].X
 		})
 		var sb strings.Builder
 		maxFS := 0.0
 		for _, t := range lines[i].Texts {
 			sb.WriteString(t.S)
 			if t.FontSize > maxFS {
 				maxFS = t.FontSize
 			}
 		}
 		lines[i].Text = strings.TrimSpace(sb.String())
 		lines[i].MaxFontSize = maxFS
 	}
 	// 3. 语义块识别
 	type Block struct {
 		Type     string // heading, paragraph, table
 		Level    int    // for heading
 		Text     string
 		FontSize float64
 	}
 	var blocks []Block
 	for i := 0; i < len(lines); i++ {
 		line := lines[i]
 		if line.Text == "" {
 			continue
 		}
 		// 表格识别逻辑
 		isTableLine, cells := p.identifyTableLine(line)
 		if isTableLine {
 			tableStr := "| " + strings.Join(cells, " | ") + " |"
 			blocks = append(blocks, Block{Type: "table", Text: tableStr})
 			continue
 		}
 		// 标题识别逻辑 (比正文大)
 		if line.MaxFontSize > float64(bodySize)+1 {
 			level := 1
 			if line.MaxFontSize < float64(bodySize)+4 {
 				level = 3
 			} else if line.MaxFontSize < float64(bodySize)+8 {
 				level = 2
 			}
 			// 合并紧随其后的同字体行（处理跨行标题）
 			fullText := line.Text
 			lastY := line.Y
 			for j := i + 1; j < len(lines); j++ {
 				if lines[j].Text != "" && MathAbs(lines[j].MaxFontSize-line.MaxFontSize) < 1.0 {
 					if MathAbs(lines[j].Y-lastY) < 25 {
 						fullText += " " + lines[j].Text
 						lastY = lines[j].Y
 						i = j
 					} else {
 						break
 					}
 				} else {
 					break
 				}
 			}
 			if !found {
 				lines = append(lines, Line{Y: t.Y, Texts: []pdf.Text{t}})
 			}
 		}
-			// 检查是否重复
+		sort.Slice(lines, func(i, j int) bool {
-			if i < 5 {
+			return lines[i].Y > lines[j].Y
-				if title, ok := p.Metadata["title"].(string); ok && strings.Contains(title, fullText) {
+		})
-					continue
+
 		for i := range lines {
 			sort.Slice(lines[i].Texts, func(m, n int) bool {
 				return lines[i].Texts[m].X < lines[i].Texts[n].X
 			})
 			var sb strings.Builder
 			maxFS := 0.0
 			for _, t := range lines[i].Texts {
 				sb.WriteString(t.S)
 				if t.FontSize > maxFS {
 					maxFS = t.FontSize
 				}
 			}
 			lines[i].Text = strings.TrimSpace(sb.String())
 			lines[i].MaxFontSize = maxFS
 		}
-			// 合并同级标题
+		// 3. 语义块识别
-			if len(blocks) > 0 && blocks[len(blocks)-1].Type == "heading" && blocks[len(blocks)-1].Level == level {
+		for j := 0; j < len(lines); j++ {
-				blocks[len(blocks)-1].Text += " " + fullText
+			line := lines[j]
-			} else {
+			if line.Text == "" {
 				blocks = append(blocks, Block{Type: "heading", Level: level, Text: fullText})
 			}
 		} else {
 			// 跳过页码
 			if (line.Y < 50 || line.Y > 800) && isPageNumber(line.Text) {
 				continue
 			}
-			// 段落识别
+			isTableLine, cells := p.identifyTableLine(line)
-			fullText := line.Text
+			if isTableLine {
-			lastY := line.Y
+				tableStr := "| " + strings.Join(cells, " | ") + " |"
-			for j := i + 1; j < len(lines); j++ {
+				allBlocks = append(allBlocks, Block{Type: "table", Text: tableStr})
-				if lines[j].Text != "" && lines[j].MaxFontSize <= float64(bodySize)+1 {
+				continue
-					isT, _ := p.identifyTableLine(lines[j])
+			}
-					if isT {
+
-						break
+			if line.MaxFontSize > float64(bodySize)+1 {
-					}
+				level := 1
-					if (lines[j].Y < 50 || lines[j].Y > 800) && isPageNumber(lines[j].Text) {
+				if line.MaxFontSize < float64(bodySize)+4 {
-						break
+					level = 3
-					}
+				} else if line.MaxFontSize < float64(bodySize)+8 {
-					if MathAbs(lines[j].Y-lastY) > 25 {
+					level = 2
-						break
+				}
-					}
+
-
+				fullText := line.Text
-					fullText += lines[j].Text
+				lastY := line.Y
-					lastY = lines[j].Y
+				for k := j + 1; k < len(lines); k++ {
-					i = j
+					if lines[k].Text != "" && MathAbs(lines[k].MaxFontSize-line.MaxFontSize) < 1.0 {
-
+						if MathAbs(lines[k].Y-lastY) < 25 {
-					if isPunctuation(lines[j].Text) {
+							fullText += " " + lines[k].Text
-						break
+							lastY = lines[k].Y
-					}
+							j = k
-				} else {
+						} else {
-					break
+							break
-				}
+						}
 					} else {
 						break
 					}
 				}
 				if j < 5 && i == 1 {
 					if title, ok := p.Metadata["title"].(string); ok && strings.Contains(title, fullText) {
 						continue
 					}
 				}
 				if len(allBlocks) > 0 && allBlocks[len(allBlocks)-1].Type == "heading" && allBlocks[len(allBlocks)-1].Level == level {
 					allBlocks[len(allBlocks)-1].Text += " " + fullText
 				} else {
 					allBlocks = append(allBlocks, Block{Type: "heading", Level: level, Text: fullText})
 				}
 			} else {
 				if (line.Y < 50 || line.Y > 800) && isPageNumber(line.Text) {
 					continue
 				}
 				fullText := line.Text
 				lastY := line.Y
 				for k := j + 1; k < len(lines); k++ {
 					if lines[k].Text != "" && lines[k].MaxFontSize <= float64(bodySize)+1 {
 						isT, _ := p.identifyTableLine(lines[k])
 						if isT {
 							break
 						}
 						if (lines[k].Y < 50 || lines[k].Y > 800) && isPageNumber(lines[k].Text) {
 							break
 						}
 						if MathAbs(lines[k].Y-lastY) > 25 {
 							break
 						}
 						fullText += lines[k].Text
 						lastY = lines[k].Y
 						j = k
 						if isPunctuation(lines[k].Text) {
 							break
 						}
 					} else {
 						break
 					}
 				}
 				allBlocks = append(allBlocks, Block{Type: "paragraph", Text: fullText})
 			}
 			blocks = append(blocks, Block{Type: "paragraph", Text: fullText})
 		}
 	}
-	// 4. 智能封面/标题逻辑
+	// 4. 智能封面/标题逻辑 (全局)
 	// 策略：
 	// 1. 第一个 Heading 始终是文档标题 (#)
 	// 2. 在遇到第一个明确的“章节标题”或“正文段落”之前，中间的 Heading 如果很长，则转为正文文本。
 	// 3. 统计 H1，如果只有一个 H1，则尝试将 H2 提升为 H1。
 	h1Count := 0
-	for _, b := range blocks {
+	for _, b := range allBlocks {
 		if b.Type == "heading" && b.Level == 1 {
 			h1Count++
 		}
 	}
 	shouldPromote := h1Count <= 1
 	firstHeadingFound := false
 	contentStarted := false
 	hasMetadataTitle := false
 	if t, ok := p.Metadata["title"].(string); ok && t != "" {
 		hasMetadataTitle = true
 	}
-	var res strings.Builder
+	firstHeadingProcessed := false
-	for _, b := range blocks {
+
 	var sb strings.Builder
 	for _, b := range allBlocks {
 		if b.Type == "heading" {
 			level := b.Level
 			if !contentStarted {
-				if !firstHeadingFound {
+				if !firstHeadingProcessed {
-					// 文档总标题
+					firstHeadingProcessed = true
-					res.WriteString("\n# " + b.Text + "\n\n")
+					// 如果有 Metadata Title，则 Metadata Title 充当了真正的第一级标题
-					firstHeadingFound = true
+					// 我们把遇到的第一个大标题降级为正文文本（除非它已经是标准章节）
-					continue
+					if hasMetadataTitle && !isStandardSection(b.Text) {
 						sb.WriteString("\n" + b.Text + "\n\n")
 						continue
 					} else {
 						// 否则作为文档的主标题
 						sb.WriteString("\n# " + b.Text + "\n\n")
 						continue
 					}
 				} else {
 					// 封面期间的其他标题
 					// 如果是已知的章节名，则认为内容开始了
 					if isStandardSection(b.Text) {
 						contentStarted = true
-						// 章节名也应该是 #
+						sb.WriteString("\n# " + b.Text + "\n\n")
 						res.WriteString("\n# " + b.Text + "\n\n")
 						continue
 					}
 					// 否则作为封面副标题/文本
-					res.WriteString(b.Text + "\n\n")
+					sb.WriteString(b.Text + "\n\n")
 					continue
 				}
 			}
 			// 内容已经开始
 			if shouldPromote {
 				if level == 2 {
 					level = 1
@@ -277,17 +260,18 @@ func (p *PDF) processPageTexts(texts []pdf.Text) string {
 					level = 2
 				}
 			}
-			res.WriteString("\n" + strings.Repeat("#", level) + " " + b.Text + "\n\n")
+			sb.WriteString("\n" + strings.Repeat("#", level) + " " + b.Text + "\n\n")
 		} else if b.Type == "paragraph" {
-			contentStarted = true // 遇到第一个正文段落，标志着封面/标题期结束
+			contentStarted = true
-			res.WriteString(b.Text + "\n\n")
+			sb.WriteString(b.Text + "\n\n")
 		} else if b.Type == "table" {
 			contentStarted = true
-			res.WriteString(b.Text + "\n")
+			sb.WriteString(b.Text + "\n")
 		}
 	}
-	return res.String()
+	p.Content = strings.TrimSpace(sb.String())
 	return p, nil
 }
 func isStandardSection(s string) bool {