2026-05-12 13:21:03 +08:00
|
|
|
|
package document
|
2026-05-12 12:30:03 +08:00
|
|
|
|
|
|
|
|
|
|
import (
|
2026-05-12 13:50:07 +08:00
|
|
|
|
"fmt"
|
2026-05-12 12:30:03 +08:00
|
|
|
|
"io"
|
2026-05-17 11:53:26 +08:00
|
|
|
|
"strings"
|
2026-05-12 12:30:03 +08:00
|
|
|
|
|
2026-05-12 13:05:16 +08:00
|
|
|
|
"apigo.cc/go/cast"
|
2026-05-12 12:30:03 +08:00
|
|
|
|
"apigo.cc/go/file"
|
|
|
|
|
|
"github.com/young2j/oxmltotext/pptxtotext"
|
|
|
|
|
|
)
|
|
|
|
|
|
|
2026-05-12 13:05:16 +08:00
|
|
|
|
// Pptx wraps reading and text extraction for a PowerPoint (.pptx) document.
type Pptx struct {
	// filename is the path the document was opened from; Save uses it as the
	// default destination.
	filename string

	// Content holds the plain text extracted from the presentation.
	Content string

	// Metadata carries arbitrary key/value data about the document. OpenPptx
	// initializes it to an empty map, and ToJSON emits it under "metadata".
	Metadata map[string]any
}
|
|
|
|
|
|
|
|
|
|
|
|
// OpenPptx 打开一个 PowerPoint 文档 (.pptx)。
|
|
|
|
|
|
func OpenPptx(filename string) (*Pptx, error) {
|
|
|
|
|
|
if !file.Exists(filename) {
|
2026-05-12 13:50:07 +08:00
|
|
|
|
return nil, fmt.Errorf("file not found: %s", filename)
|
2026-05-12 12:30:03 +08:00
|
|
|
|
}
|
2026-05-12 13:05:16 +08:00
|
|
|
|
p := &Pptx{
|
|
|
|
|
|
filename: filename,
|
|
|
|
|
|
Metadata: make(map[string]any),
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
pp, err := pptxtotext.Open(filename)
|
|
|
|
|
|
if err == nil {
|
|
|
|
|
|
defer pp.Close()
|
|
|
|
|
|
p.Content, _ = pp.ExtractTexts()
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return p, nil
|
2026-05-12 12:30:03 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2026-05-12 13:05:16 +08:00
|
|
|
|
// ToJSON 返回结构化 JSON。
|
|
|
|
|
|
func (p *Pptx) ToJSON() string {
|
|
|
|
|
|
res, _ := cast.ToJSON(map[string]any{
|
|
|
|
|
|
"metadata": p.Metadata,
|
|
|
|
|
|
"content": p.Content,
|
|
|
|
|
|
})
|
|
|
|
|
|
return res
|
|
|
|
|
|
}
|
2026-05-12 12:30:03 +08:00
|
|
|
|
|
2026-05-12 13:05:16 +08:00
|
|
|
|
// ToMarkdown 返回 Markdown。
|
|
|
|
|
|
func (p *Pptx) ToMarkdown() string {
|
2026-05-17 11:53:26 +08:00
|
|
|
|
if p.Content == "" {
|
|
|
|
|
|
return ""
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
lines := strings.Split(p.Content, "\n")
|
|
|
|
|
|
var res []string
|
|
|
|
|
|
nextIsMainTitle := true
|
|
|
|
|
|
|
|
|
|
|
|
for _, line := range lines {
|
|
|
|
|
|
trimmed := strings.TrimSpace(line)
|
|
|
|
|
|
if trimmed == "" {
|
|
|
|
|
|
continue
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 检查幻灯片分隔符
|
|
|
|
|
|
if strings.Contains(line, "--------------------------------") {
|
|
|
|
|
|
res = append(res, "\n---")
|
|
|
|
|
|
nextIsMainTitle = true
|
|
|
|
|
|
continue
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 启发式识别标题:行短、无句末标点
|
|
|
|
|
|
if len([]rune(trimmed)) < 50 && !strings.HasSuffix(trimmed, ".") && !strings.HasSuffix(trimmed, "。") && !strings.HasSuffix(trimmed, ":") && !strings.HasSuffix(trimmed, ":") {
|
|
|
|
|
|
if nextIsMainTitle {
|
|
|
|
|
|
res = append(res, "\n# "+trimmed)
|
|
|
|
|
|
nextIsMainTitle = false
|
|
|
|
|
|
} else {
|
|
|
|
|
|
res = append(res, "\n## "+trimmed)
|
|
|
|
|
|
}
|
|
|
|
|
|
} else {
|
|
|
|
|
|
res = append(res, trimmed)
|
|
|
|
|
|
// 如果该页已经有了主标题,后续的长文本不会重置标题状态
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return strings.TrimSpace(strings.Join(res, "\n"))
|
2026-05-12 13:05:16 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Save 保存文档(目前保存为提取后的文本)。
|
|
|
|
|
|
func (p *Pptx) Save(filename ...string) error {
|
|
|
|
|
|
path := p.filename
|
|
|
|
|
|
if len(filename) > 0 && filename[0] != "" {
|
|
|
|
|
|
path = filename[0]
|
|
|
|
|
|
}
|
|
|
|
|
|
return file.Write(path, p.Content)
|
2026-05-12 12:30:03 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ReadText 从 io.Reader 中读取并提取 PPT 文本。
|
2026-05-12 12:36:41 +08:00
|
|
|
|
func (p *Pptx) ReadText(r io.ReaderAt, size int64) (string, error) {
|
|
|
|
|
|
pp, err := pptxtotext.OpenReader(r, size)
|
2026-05-12 12:30:03 +08:00
|
|
|
|
if err != nil {
|
|
|
|
|
|
return "", err
|
|
|
|
|
|
}
|
2026-05-12 12:36:41 +08:00
|
|
|
|
defer pp.Close()
|
|
|
|
|
|
|
|
|
|
|
|
return pp.ExtractTexts()
|
|
|
|
|
|
}
|