package document import ( "fmt" "sort" "strings" "apigo.cc/go/cast" "apigo.cc/go/file" "github.com/dslipak/pdf" ) // PDF 封装了 PDF 文档的读取与识别。 type PDF struct { filename string Content string Metadata map[string]any } // OpenPDF 打开一个 PDF 文档。 func OpenPDF(filename string) (*PDF, error) { if !file.Exists(filename) { return nil, fmt.Errorf("file not found: %s", filename) } p := &PDF{ filename: filename, Metadata: make(map[string]any), } f, err := pdf.Open(filename) if err != nil { return nil, err } p.Metadata["pages"] = f.NumPage() trailer := f.Trailer() infoDict := trailer.Key("Info") if !infoDict.IsNull() { for _, field := range infoDict.Keys() { val := infoDict.Key(field).Text() if val != "" { p.Metadata[strings.ToLower(field)] = val } } } var sb strings.Builder for i := 1; i <= f.NumPage(); i++ { page := f.Page(i) if page.V.IsNull() { continue } content := page.Content() texts := content.Text if len(texts) == 0 { continue } // 处理页面内容 sb.WriteString(p.processPageTexts(texts)) } p.Content = strings.TrimSpace(sb.String()) return p, nil } func (p *PDF) processPageTexts(texts []pdf.Text) string { if len(texts) == 0 { return "" } // 1. 估算正文字体大小(众数) fontSizes := make(map[int]int) for _, t := range texts { fontSizes[int(t.FontSize)]++ } bodySize := 0 maxCount := 0 for size, count := range fontSizes { if count > maxCount { maxCount = count bodySize = size } } // 2. 按行分组(基于 Y 坐标) type Line struct { Y float64 MaxFontSize float64 Text string Texts []pdf.Text } var lines []Line for _, t := range texts { found := false for i := range lines { if t.Y > lines[i].Y-2 && t.Y < lines[i].Y+2 { lines[i].Texts = append(lines[i].Texts, t) found = true break } } if !found { lines = append(lines, Line{Y: t.Y, Texts: []pdf.Text{t}}) } } // 按 Y 降序排列(从上到下) sort.Slice(lines, func(i, j int) bool { return lines[i].Y > lines[j].Y }) // 预处理每一行的文本和最大字体 for i := range lines { sort.Slice(lines[i].Texts, func(m, n int) bool { return lines[i].Texts[m].X < lines[i].Texts[n].X }) var sb strings.Builder maxFS := 0.0 for _, t := range lines[i].Texts { sb.WriteString(t.S) if t.FontSize > maxFS { maxFS = t.FontSize } } lines[i].Text = strings.TrimSpace(sb.String()) lines[i].MaxFontSize = maxFS } // 3. 语义块识别 type Block struct { Type string // heading, paragraph, table Level int // for heading Text string FontSize float64 } var blocks []Block for i := 0; i < len(lines); i++ { line := lines[i] if line.Text == "" { continue } // 表格识别逻辑 isTableLine, cells := p.identifyTableLine(line) if isTableLine { tableStr := "| " + strings.Join(cells, " | ") + " |" blocks = append(blocks, Block{Type: "table", Text: tableStr}) continue } // 标题识别逻辑 (比正文大) if line.MaxFontSize > float64(bodySize)+1 { level := 1 if line.MaxFontSize < float64(bodySize)+4 { level = 3 } else if line.MaxFontSize < float64(bodySize)+8 { level = 2 } // 合并紧随其后的同字体行(处理跨行标题) fullText := line.Text lastY := line.Y for j := i + 1; j < len(lines); j++ { if lines[j].Text != "" && MathAbs(lines[j].MaxFontSize-line.MaxFontSize) < 1.0 { if MathAbs(lines[j].Y-lastY) < 25 { fullText += " " + lines[j].Text lastY = lines[j].Y i = j } else { break } } else { break } } // 检查是否重复 if i < 5 { if title, ok := p.Metadata["title"].(string); ok && strings.Contains(title, fullText) { continue } } // 合并同级标题 if len(blocks) > 0 && blocks[len(blocks)-1].Type == "heading" && blocks[len(blocks)-1].Level == level { blocks[len(blocks)-1].Text += " " + fullText } else { blocks = append(blocks, Block{Type: "heading", Level: level, Text: fullText}) } } else { // 跳过页码 if (line.Y < 50 || line.Y > 800) && isPageNumber(line.Text) { continue } // 段落识别 fullText := line.Text lastY := line.Y for j := i + 1; j < len(lines); j++ { if lines[j].Text != "" && lines[j].MaxFontSize <= float64(bodySize)+1 { isT, _ := p.identifyTableLine(lines[j]) if isT { break } if (lines[j].Y < 50 || lines[j].Y > 800) && isPageNumber(lines[j].Text) { break } if MathAbs(lines[j].Y-lastY) > 25 { break } fullText += lines[j].Text lastY = lines[j].Y i = j if isPunctuation(lines[j].Text) { break } } else { break } } blocks = append(blocks, Block{Type: "paragraph", Text: fullText}) } } // 4. 智能封面/标题逻辑 // 策略: // 1. 第一个 Heading 始终是文档标题 (#) // 2. 在遇到第一个明确的“章节标题”或“正文段落”之前,中间的 Heading 如果很长,则转为正文文本。 // 3. 统计 H1,如果只有一个 H1,则尝试将 H2 提升为 H1。 h1Count := 0 for _, b := range blocks { if b.Type == "heading" && b.Level == 1 { h1Count++ } } shouldPromote := h1Count <= 1 firstHeadingFound := false contentStarted := false var res strings.Builder for _, b := range blocks { if b.Type == "heading" { level := b.Level if !contentStarted { if !firstHeadingFound { // 文档总标题 res.WriteString("\n# " + b.Text + "\n\n") firstHeadingFound = true continue } else { // 封面期间的其他标题 // 如果是已知的章节名,则认为内容开始了 if isStandardSection(b.Text) { contentStarted = true // 章节名也应该是 # res.WriteString("\n# " + b.Text + "\n\n") continue } // 否则作为封面副标题/文本 res.WriteString(b.Text + "\n\n") continue } } // 内容已经开始 if shouldPromote { if level == 2 { level = 1 } else if level == 3 { level = 2 } } res.WriteString("\n" + strings.Repeat("#", level) + " " + b.Text + "\n\n") } else if b.Type == "paragraph" { contentStarted = true // 遇到第一个正文段落,标志着封面/标题期结束 res.WriteString(b.Text + "\n\n") } else if b.Type == "table" { contentStarted = true res.WriteString(b.Text + "\n") } } return res.String() } func isStandardSection(s string) bool { s = strings.TrimSpace(s) // 常见的章节开头关键词 standards := []string{"引言", "摘要", "目录", "前言", "结论", "背景", "概述", "Introduction", "Abstract", "Conclusion", "Summary"} for _, std := range standards { if strings.Contains(s, std) { return true } } // 如果标题带有数字编号且较短,也认为是章节开始 if len([]rune(s)) < 20 { if s[0] >= '0' && s[0] <= '9' { return true } if strings.HasPrefix(s, "第") && (strings.Contains(s, "章") || strings.Contains(s, "节") || strings.Contains(s, "部分")) { return true } } return false } func isPageNumber(s string) bool { s = strings.TrimSpace(s) if s == "" { return false } isNum := true for _, r := range s { if r < '0' || r > '9' { isNum = false break } } if isNum { return true } lower := strings.ToLower(s) return strings.HasPrefix(lower, "page") || (strings.HasPrefix(s, "-") && strings.HasSuffix(s, "-")) } func isPunctuation(s string) bool { if s == "" { return false } runes := []rune(s) last := runes[len(runes)-1] return strings.ContainsRune("。.!!??::", last) } func (p *PDF) identifyTableLine(line struct { Y float64 MaxFontSize float64 Text string Texts []pdf.Text }) (bool, []string) { if len(line.Texts) < 3 { return false, nil } var cells []string var currentCell strings.Builder lastX := -1.0 for _, t := range line.Texts { if lastX != -1.0 && t.X-(lastX) > 40 { content := strings.TrimSpace(currentCell.String()) if content != "" { cells = append(cells, content) } currentCell.Reset() } currentCell.WriteString(t.S) lastX = t.X + t.W } finalCell := strings.TrimSpace(currentCell.String()) if finalCell != "" { cells = append(cells, finalCell) } if len(cells) >= 2 { allSingleChar := true for _, c := range cells { r := []rune(c) if len(r) > 1 { allSingleChar = false } if len(r) > 40 { return false, nil } } if allSingleChar { return false, nil } return true, cells } return false, nil } func MathAbs(v float64) float64 { if v < 0 { return -v } return v } // ToJSON 返回结构化 JSON。 func (p *PDF) ToJSON() string { res, _ := cast.ToJSON(map[string]any{ "metadata": p.Metadata, "content": p.Content, }) return res } // ToMarkdown 返回 Markdown。 func (p *PDF) ToMarkdown() string { if p.Content == "" { return "" } var sb strings.Builder if title, ok := p.Metadata["title"]; ok && title != "" { sb.WriteString("# " + cast.To[string](title) + "\n\n") } sb.WriteString(p.Content) return sb.String() } // Save 保存(目前保存为提取后的文本)。 func (p *PDF) Save(filename ...string) error { path := p.filename if len(filename) > 0 && filename[0] != "" { path = filename[0] } return file.Write(path, p.Content) }