2026-05-12 13:21:03 +08:00
|
|
|
package document
|
2026-05-12 12:30:03 +08:00
|
|
|
|
|
|
|
|
import (
	"bytes"
	"fmt"
	"io"
	"os"
	"strings"

	"apigo.cc/go/cast"
	"apigo.cc/go/file"
	"github.com/dslipak/pdf"
)
|
|
|
|
|
|
2026-05-12 13:05:16 +08:00
|
|
|
// PDF wraps the reading and recognition of a PDF document: it remembers where
// the document came from and carries what was extracted from it.
type PDF struct {
	// filename is the path the document was opened from. Save uses it as the
	// destination when the caller supplies no path.
	filename string
	// Content is the text extracted from the document. It stays empty when
	// extraction did not succeed.
	Content string
	// Metadata holds document properties. OpenPDF stores the page count under
	// "pages" and each non-empty Info dictionary entry under its lower-cased
	// field name.
	Metadata map[string]any
}
|
|
|
|
|
|
|
|
|
|
// OpenPDF 打开一个 PDF 文档。
|
|
|
|
|
func OpenPDF(filename string) (*PDF, error) {
|
|
|
|
|
if !file.Exists(filename) {
|
2026-05-12 13:50:07 +08:00
|
|
|
return nil, fmt.Errorf("file not found: %s", filename)
|
2026-05-12 12:30:03 +08:00
|
|
|
}
|
2026-05-12 13:05:16 +08:00
|
|
|
p := &PDF{
|
|
|
|
|
filename: filename,
|
|
|
|
|
Metadata: make(map[string]any),
|
2026-05-12 12:30:03 +08:00
|
|
|
}
|
|
|
|
|
|
2026-05-12 13:05:16 +08:00
|
|
|
f, err := pdf.Open(filename)
|
|
|
|
|
if err == nil {
|
|
|
|
|
p.Metadata["pages"] = f.NumPage()
|
|
|
|
|
trailer := f.Trailer()
|
|
|
|
|
infoDict := trailer.Key("Info")
|
|
|
|
|
if !infoDict.IsNull() {
|
|
|
|
|
for _, field := range infoDict.Keys() {
|
|
|
|
|
val := infoDict.Key(field).Text()
|
|
|
|
|
if val != "" {
|
|
|
|
|
p.Metadata[strings.ToLower(field)] = val
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
var b bytes.Buffer
|
|
|
|
|
if t, err := f.GetPlainText(); err == nil {
|
|
|
|
|
io.Copy(&b, t)
|
|
|
|
|
p.Content = b.String()
|
|
|
|
|
}
|
2026-05-12 12:30:03 +08:00
|
|
|
}
|
|
|
|
|
|
2026-05-12 13:05:16 +08:00
|
|
|
return p, nil
|
2026-05-12 12:30:03 +08:00
|
|
|
}
|
|
|
|
|
|
2026-05-12 13:05:16 +08:00
|
|
|
// ToJSON 返回结构化 JSON。
|
|
|
|
|
func (p *PDF) ToJSON() string {
|
|
|
|
|
res, _ := cast.ToJSON(map[string]any{
|
|
|
|
|
"metadata": p.Metadata,
|
|
|
|
|
"content": p.Content,
|
|
|
|
|
})
|
|
|
|
|
return res
|
2026-05-12 12:36:41 +08:00
|
|
|
}
|
|
|
|
|
|
2026-05-12 13:05:16 +08:00
|
|
|
// ToMarkdown 返回 Markdown。
|
|
|
|
|
func (p *PDF) ToMarkdown() string {
|
|
|
|
|
return p.Content
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Save 保存(目前保存为提取后的文本)。
|
|
|
|
|
func (p *PDF) Save(filename ...string) error {
|
|
|
|
|
path := p.filename
|
|
|
|
|
if len(filename) > 0 && filename[0] != "" {
|
|
|
|
|
path = filename[0]
|
2026-05-12 12:30:03 +08:00
|
|
|
}
|
2026-05-12 13:05:16 +08:00
|
|
|
return file.Write(path, p.Content)
|
2026-05-12 12:30:03 +08:00
|
|
|
}
|