426 lines
9.3 KiB
Go
426 lines
9.3 KiB
Go
package document
|
||
|
||
import (
	"fmt"
	"os"
	"sort"
	"strings"

	"apigo.cc/go/cast"
	"apigo.cc/go/file"
	"github.com/dslipak/pdf"
)
|
||
|
||
// PDF wraps a PDF document whose metadata and text have been extracted.
type PDF struct {
	// filename is the path the document was opened from; Save uses it when
	// no explicit target is given.
	filename string

	// Content is the text extracted from all pages, trimmed of surrounding
	// whitespace.
	Content string

	// Metadata holds the page count under "pages" plus the non-empty entries
	// of the PDF Info dictionary, keyed by their lower-cased names.
	Metadata map[string]any
}
|
||
|
||
// OpenPDF 打开一个 PDF 文档。
|
||
func OpenPDF(filename string) (*PDF, error) {
|
||
if !file.Exists(filename) {
|
||
return nil, fmt.Errorf("file not found: %s", filename)
|
||
}
|
||
p := &PDF{
|
||
filename: filename,
|
||
Metadata: make(map[string]any),
|
||
}
|
||
|
||
f, err := pdf.Open(filename)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
p.Metadata["pages"] = f.NumPage()
|
||
trailer := f.Trailer()
|
||
infoDict := trailer.Key("Info")
|
||
if !infoDict.IsNull() {
|
||
for _, field := range infoDict.Keys() {
|
||
val := infoDict.Key(field).Text()
|
||
if val != "" {
|
||
p.Metadata[strings.ToLower(field)] = val
|
||
}
|
||
}
|
||
}
|
||
|
||
var sb strings.Builder
|
||
for i := 1; i <= f.NumPage(); i++ {
|
||
page := f.Page(i)
|
||
if page.V.IsNull() {
|
||
continue
|
||
}
|
||
|
||
content := page.Content()
|
||
texts := content.Text
|
||
if len(texts) == 0 {
|
||
continue
|
||
}
|
||
|
||
// 处理页面内容
|
||
sb.WriteString(p.processPageTexts(texts))
|
||
}
|
||
p.Content = strings.TrimSpace(sb.String())
|
||
|
||
return p, nil
|
||
}
|
||
|
||
func (p *PDF) processPageTexts(texts []pdf.Text) string {
|
||
if len(texts) == 0 {
|
||
return ""
|
||
}
|
||
|
||
// 1. 估算正文字体大小(众数)
|
||
fontSizes := make(map[int]int)
|
||
for _, t := range texts {
|
||
fontSizes[int(t.FontSize)]++
|
||
}
|
||
bodySize := 0
|
||
maxCount := 0
|
||
for size, count := range fontSizes {
|
||
if count > maxCount {
|
||
maxCount = count
|
||
bodySize = size
|
||
}
|
||
}
|
||
|
||
// 2. 按行分组(基于 Y 坐标)
|
||
type Line struct {
|
||
Y float64
|
||
MaxFontSize float64
|
||
Text string
|
||
Texts []pdf.Text
|
||
}
|
||
var lines []Line
|
||
for _, t := range texts {
|
||
found := false
|
||
for i := range lines {
|
||
if t.Y > lines[i].Y-2 && t.Y < lines[i].Y+2 {
|
||
lines[i].Texts = append(lines[i].Texts, t)
|
||
found = true
|
||
break
|
||
}
|
||
}
|
||
if !found {
|
||
lines = append(lines, Line{Y: t.Y, Texts: []pdf.Text{t}})
|
||
}
|
||
}
|
||
|
||
// 按 Y 降序排列(从上到下)
|
||
sort.Slice(lines, func(i, j int) bool {
|
||
return lines[i].Y > lines[j].Y
|
||
})
|
||
|
||
// 预处理每一行的文本和最大字体
|
||
for i := range lines {
|
||
sort.Slice(lines[i].Texts, func(m, n int) bool {
|
||
return lines[i].Texts[m].X < lines[i].Texts[n].X
|
||
})
|
||
var sb strings.Builder
|
||
maxFS := 0.0
|
||
for _, t := range lines[i].Texts {
|
||
sb.WriteString(t.S)
|
||
if t.FontSize > maxFS {
|
||
maxFS = t.FontSize
|
||
}
|
||
}
|
||
lines[i].Text = strings.TrimSpace(sb.String())
|
||
lines[i].MaxFontSize = maxFS
|
||
}
|
||
|
||
// 3. 语义块识别
|
||
type Block struct {
|
||
Type string // heading, paragraph, table
|
||
Level int // for heading
|
||
Text string
|
||
FontSize float64
|
||
}
|
||
var blocks []Block
|
||
for i := 0; i < len(lines); i++ {
|
||
line := lines[i]
|
||
if line.Text == "" {
|
||
continue
|
||
}
|
||
|
||
// 表格识别逻辑
|
||
isTableLine, cells := p.identifyTableLine(line)
|
||
if isTableLine {
|
||
tableStr := "| " + strings.Join(cells, " | ") + " |"
|
||
blocks = append(blocks, Block{Type: "table", Text: tableStr})
|
||
continue
|
||
}
|
||
|
||
// 标题识别逻辑 (比正文大)
|
||
if line.MaxFontSize > float64(bodySize)+1 {
|
||
level := 1
|
||
if line.MaxFontSize < float64(bodySize)+4 {
|
||
level = 3
|
||
} else if line.MaxFontSize < float64(bodySize)+8 {
|
||
level = 2
|
||
}
|
||
|
||
// 合并紧随其后的同字体行(处理跨行标题)
|
||
fullText := line.Text
|
||
lastY := line.Y
|
||
for j := i + 1; j < len(lines); j++ {
|
||
if lines[j].Text != "" && MathAbs(lines[j].MaxFontSize-line.MaxFontSize) < 1.0 {
|
||
if MathAbs(lines[j].Y-lastY) < 25 {
|
||
fullText += " " + lines[j].Text
|
||
lastY = lines[j].Y
|
||
i = j
|
||
} else {
|
||
break
|
||
}
|
||
} else {
|
||
break
|
||
}
|
||
}
|
||
|
||
// 检查是否重复
|
||
if i < 5 {
|
||
if title, ok := p.Metadata["title"].(string); ok && strings.Contains(title, fullText) {
|
||
continue
|
||
}
|
||
}
|
||
|
||
// 合并同级标题
|
||
if len(blocks) > 0 && blocks[len(blocks)-1].Type == "heading" && blocks[len(blocks)-1].Level == level {
|
||
blocks[len(blocks)-1].Text += " " + fullText
|
||
} else {
|
||
blocks = append(blocks, Block{Type: "heading", Level: level, Text: fullText})
|
||
}
|
||
} else {
|
||
// 跳过页码
|
||
if (line.Y < 50 || line.Y > 800) && isPageNumber(line.Text) {
|
||
continue
|
||
}
|
||
|
||
// 段落识别
|
||
fullText := line.Text
|
||
lastY := line.Y
|
||
for j := i + 1; j < len(lines); j++ {
|
||
if lines[j].Text != "" && lines[j].MaxFontSize <= float64(bodySize)+1 {
|
||
isT, _ := p.identifyTableLine(lines[j])
|
||
if isT {
|
||
break
|
||
}
|
||
if (lines[j].Y < 50 || lines[j].Y > 800) && isPageNumber(lines[j].Text) {
|
||
break
|
||
}
|
||
if MathAbs(lines[j].Y-lastY) > 25 {
|
||
break
|
||
}
|
||
|
||
fullText += lines[j].Text
|
||
lastY = lines[j].Y
|
||
i = j
|
||
|
||
if isPunctuation(lines[j].Text) {
|
||
break
|
||
}
|
||
} else {
|
||
break
|
||
}
|
||
}
|
||
blocks = append(blocks, Block{Type: "paragraph", Text: fullText})
|
||
}
|
||
}
|
||
|
||
// 4. 智能封面/标题逻辑
|
||
// 策略:
|
||
// 1. 第一个 Heading 始终是文档标题 (#)
|
||
// 2. 在遇到第一个明确的“章节标题”或“正文段落”之前,中间的 Heading 如果很长,则转为正文文本。
|
||
// 3. 统计 H1,如果只有一个 H1,则尝试将 H2 提升为 H1。
|
||
|
||
h1Count := 0
|
||
for _, b := range blocks {
|
||
if b.Type == "heading" && b.Level == 1 {
|
||
h1Count++
|
||
}
|
||
}
|
||
shouldPromote := h1Count <= 1
|
||
|
||
firstHeadingFound := false
|
||
contentStarted := false
|
||
|
||
var res strings.Builder
|
||
for _, b := range blocks {
|
||
if b.Type == "heading" {
|
||
level := b.Level
|
||
|
||
if !contentStarted {
|
||
if !firstHeadingFound {
|
||
// 文档总标题
|
||
res.WriteString("\n# " + b.Text + "\n\n")
|
||
firstHeadingFound = true
|
||
continue
|
||
} else {
|
||
// 封面期间的其他标题
|
||
// 如果是已知的章节名,则认为内容开始了
|
||
if isStandardSection(b.Text) {
|
||
contentStarted = true
|
||
// 章节名也应该是 #
|
||
res.WriteString("\n# " + b.Text + "\n\n")
|
||
continue
|
||
}
|
||
// 否则作为封面副标题/文本
|
||
res.WriteString(b.Text + "\n\n")
|
||
continue
|
||
}
|
||
}
|
||
|
||
// 内容已经开始
|
||
if shouldPromote {
|
||
if level == 2 {
|
||
level = 1
|
||
} else if level == 3 {
|
||
level = 2
|
||
}
|
||
}
|
||
res.WriteString("\n" + strings.Repeat("#", level) + " " + b.Text + "\n\n")
|
||
} else if b.Type == "paragraph" {
|
||
contentStarted = true // 遇到第一个正文段落,标志着封面/标题期结束
|
||
res.WriteString(b.Text + "\n\n")
|
||
} else if b.Type == "table" {
|
||
contentStarted = true
|
||
res.WriteString(b.Text + "\n")
|
||
}
|
||
}
|
||
|
||
return res.String()
|
||
}
|
||
|
||
// isStandardSection reports whether s looks like the heading of a regular
// document section (as opposed to cover-page text).
//
// After trimming whitespace, s qualifies when it contains one of a fixed list
// of common section keywords, or when it is shorter than 20 runes and either
// starts with an ASCII digit or starts with "第" and contains "章", "节" or
// "部分". An empty or whitespace-only string never qualifies.
func isStandardSection(s string) bool {
	s = strings.TrimSpace(s)
	// Guard against empty input: the numbered-heading check below indexes s[0].
	if s == "" {
		return false
	}

	// Common keywords that open a section.
	standards := []string{"引言", "摘要", "目录", "前言", "结论", "背景", "概述", "Introduction", "Abstract", "Conclusion", "Summary"}
	for _, std := range standards {
		if strings.Contains(s, std) {
			return true
		}
	}

	// A short heading that carries a number is also treated as a section start.
	if len([]rune(s)) < 20 {
		if s[0] >= '0' && s[0] <= '9' {
			return true
		}
		if strings.HasPrefix(s, "第") && (strings.Contains(s, "章") || strings.Contains(s, "节") || strings.Contains(s, "部分")) {
			return true
		}
	}
	return false
}
|
||
|
||
func isPageNumber(s string) bool {
|
||
s = strings.TrimSpace(s)
|
||
if s == "" {
|
||
return false
|
||
}
|
||
isNum := true
|
||
for _, r := range s {
|
||
if r < '0' || r > '9' {
|
||
isNum = false
|
||
break
|
||
}
|
||
}
|
||
if isNum {
|
||
return true
|
||
}
|
||
lower := strings.ToLower(s)
|
||
return strings.HasPrefix(lower, "page") || (strings.HasPrefix(s, "-") && strings.HasSuffix(s, "-"))
|
||
}
|
||
|
||
// isPunctuation reports whether the last rune of s is sentence-ending
// punctuation. It returns false for an empty string.
//
// The recognised set is the CJK full stop plus the ASCII and full-width forms
// of '.', '!', '?' and ':'. The full-width forms are written as escapes so the
// set cannot be silently folded to ASCII duplicates by an editor or tool.
func isPunctuation(s string) bool {
	if s == "" {
		return false
	}
	const sentenceEnders = "\u3002" + // 。 ideographic full stop
		".!?:" +
		"\uFF0E\uFF01\uFF1F\uFF1A" // full-width . ! ? :
	runes := []rune(s)
	last := runes[len(runes)-1]
	return strings.ContainsRune(sentenceEnders, last)
}
|
||
|
||
func (p *PDF) identifyTableLine(line struct {
|
||
Y float64
|
||
MaxFontSize float64
|
||
Text string
|
||
Texts []pdf.Text
|
||
}) (bool, []string) {
|
||
if len(line.Texts) < 3 {
|
||
return false, nil
|
||
}
|
||
var cells []string
|
||
var currentCell strings.Builder
|
||
lastX := -1.0
|
||
for _, t := range line.Texts {
|
||
if lastX != -1.0 && t.X-(lastX) > 40 {
|
||
content := strings.TrimSpace(currentCell.String())
|
||
if content != "" {
|
||
cells = append(cells, content)
|
||
}
|
||
currentCell.Reset()
|
||
}
|
||
currentCell.WriteString(t.S)
|
||
lastX = t.X + t.W
|
||
}
|
||
finalCell := strings.TrimSpace(currentCell.String())
|
||
if finalCell != "" {
|
||
cells = append(cells, finalCell)
|
||
}
|
||
|
||
if len(cells) >= 2 {
|
||
allSingleChar := true
|
||
for _, c := range cells {
|
||
r := []rune(c)
|
||
if len(r) > 1 {
|
||
allSingleChar = false
|
||
}
|
||
if len(r) > 40 {
|
||
return false, nil
|
||
}
|
||
}
|
||
if allSingleChar {
|
||
return false, nil
|
||
}
|
||
return true, cells
|
||
}
|
||
return false, nil
|
||
}
|
||
|
||
// MathAbs returns the absolute value of v: -v when v is negative, v otherwise.
func MathAbs(v float64) float64 {
	switch {
	case v < 0:
		return -v
	default:
		return v
	}
}
|
||
|
||
// ToJSON 返回结构化 JSON。
|
||
func (p *PDF) ToJSON() string {
|
||
res, _ := cast.ToJSON(map[string]any{
|
||
"metadata": p.Metadata,
|
||
"content": p.Content,
|
||
})
|
||
return res
|
||
}
|
||
|
||
// ToMarkdown 返回 Markdown。
|
||
func (p *PDF) ToMarkdown() string {
|
||
if p.Content == "" {
|
||
return ""
|
||
}
|
||
var sb strings.Builder
|
||
if title, ok := p.Metadata["title"]; ok && title != "" {
|
||
sb.WriteString("# " + cast.To[string](title) + "\n\n")
|
||
}
|
||
sb.WriteString(p.Content)
|
||
return sb.String()
|
||
}
|
||
|
||
// Save 保存(目前保存为提取后的文本)。
|
||
func (p *PDF) Save(filename ...string) error {
|
||
path := p.filename
|
||
if len(filename) > 0 && filename[0] != "" {
|
||
path = filename[0]
|
||
}
|
||
return file.Write(path, p.Content)
|
||
}
|