Compare commits

..

No commits in common. "main" and "v1.0.7" have entirely different histories.
main ... v1.0.7

11 changed files with 59 additions and 791 deletions

1
.gitignore vendored
View File

@ -5,4 +5,3 @@ env.json
env.yml env.yml
env.yaml env.yaml
.log.meta.json .log.meta.json
/test_res/

View File

@ -1,35 +1,5 @@
# CHANGELOG # CHANGELOG
## v1.5.1 (2026-06-08)
- **JS 对齐 & 智能文档**:
- 将所有注册到 `jsmod` 的方法名统一为 PascalCase。
- **可选参数优化**: 将 `Open` 的 `password` 与 `Save` 的 `filename` 改为指针类型。配合最新的 `go/js` 引擎,生成的 `.d.ts` 定义将正确提示为可选参数 `?`,极大优化了 AI 编码体验。
## v1.1.0 (2026-05-17)
- **PDF 语义重构**: 引入全局语义分析引擎。
- **无缝流**: 彻底移除分页干扰(移除 `---` 与 `Page X` 标记),实现跨页内容自然合并。
- **智能标题层级**: 自动锁定文档总标题,智能识别并合并跨行长标题,避免封面期标题碎片化。
- **自动平衡**: 根据文档内容自动提升章节标题层级,确保 Markdown 目录结构平衡。
- **鲁棒表格识别**: 调优列间距算法并增加长句过滤,大幅降低 PDF 误判表格的概率。
- **转换能力全面对齐**:
- **PPTX**: 每一页幻灯片标题现在统一识别为 `#` 一级标题,优化页面视觉分层。
- **DOCX**: 重构 XML 解析,完美支持 Word 原生表格 (`w:tbl`) 转换为 Markdown 表格。
- **XLSX**: 提升 Sheet 名称为 `#` 标题。
- **工程化增强**: 新增 `test_res/testmd.sh` 自动化验证脚本,覆盖 4 种主流办公格式的 Markdown 转换质量。
- **接口一致性**: 强制所有格式 Markdown 输出从一级标题 (`#`) 开始。
## v1.0.9 (2026-05-17)
- **新特性**: 内置功能完整的命令行工具 `doc` (`cmd/doc`)。
- **功能增强**: `doc` 支持创建、打开、查看预览(Markdown)、转换为 JSON、查看元数据(Inspect)以及数据注入(Excel)。
- **文档优化**: README 增加 `doc` 命令行工具的安装与使用指南。
## v1.0.8 (2026-05-15)
- **基础设施对齐**: 同步更新 `go/cast` 与 `go/file` 至最新版本。
- **功能修复**: 修复 Excel 单元格解析在某些边界情况下的偏差。
## v1.0.7 (2026-05-14)
- **依赖同步**: 对齐基础设施版本。
## v1.0.6 (2026-05-13) ## v1.0.6 (2026-05-13)
- **新特性**: 支持 `.csv` 格式,支持对象数组与 Markdown 表格转换。 - **新特性**: 支持 `.csv` 格式,支持对象数组与 Markdown 表格转换。
- **新特性**: 支持 `.md``.txt` 格式,统一纳入 `Document` 接口管理。 - **新特性**: 支持 `.md``.txt` 格式,统一纳入 `Document` 接口管理。

View File

@ -72,37 +72,6 @@ mdStr := g.ToMarkdown() // 包含 Mermaid graph TD 的渲染内容
- `ToMarkdown() string` - `ToMarkdown() string`
- `Save(filename ...string) error` - `Save(filename ...string) error`
## 命令行工具 (doc)
`document` 包内置了一个强大的命令行工具 `doc`,位于 `cmd/doc` 目录下。
### 安装
使用 `go install` 安装,生成的二进制文件名即为 `doc`
```bash
go install apigo.cc/go/document/cmd/doc@latest
```
### 常用命令
```bash
# 1. 预览 Excel/Word/PDF 内容 (默认输出 Markdown)
doc report.xlsx
# 2. 将文档转为结构化 JSON (适合 RAG 或自动化脚本)
doc manual.docx --json
# 3. 提取 PDF 内容并保存为 Markdown 文件
doc paper.pdf -o paper.md
# 4. 向已有的 Excel 注入数据 (支持追加或覆盖)
doc --data '[{"id":1,"name":"Alice"}]' -o test.xlsx test.xlsx
# 5. 查看文档元数据 (如工作表名、PDF 页数等)
doc paper.pdf --inspect
```
### 帮助信息
运行 `doc --help` 查看完整参数说明。
### Graph 专用 (关系型文档) ### Graph 专用 (关系型文档)
- `AddNode(n *Node)` - `AddNode(n *Node)`
- `OpenGraph(filename string) (*Graph, error)` - `OpenGraph(filename string) (*Graph, error)`

View File

@ -1,159 +0,0 @@
package main
import (
"flag"
"fmt"
"os"
"strings"
"apigo.cc/go/cast"
"apigo.cc/go/document"
)
// Command-line flags of the doc tool, registered on the default flag set.
var (
// jsonOut selects JSON output (doc.ToJSON) instead of Markdown.
jsonOut = flag.Bool("json", false, "以 JSON 格式输出文档内容")
// mdOut is declared for completeness; the visible code never reads it,
// because Markdown is printed whenever -json is absent.
mdOut = flag.Bool("md", false, "以 Markdown 格式输出文档内容 (默认模式)")
// savePath, when non-empty, makes main call doc.Save instead of printing.
savePath = flag.String("o", "", "保存结果到指定文件路径 (如: output.xlsx, content.md)")
// createType, when non-empty, is passed to document.Create and takes
// precedence over opening a file.
createType = flag.String("create", "", "创建新文档,支持类型: xlsx, csv, graph, md")
// password is forwarded to document.Open when non-empty.
password = flag.String("password", "", "访问加密文档所需的密码 (主要针对 Excel)")
// sheetName is forwarded to Excel.SetData by applyData.
sheetName = flag.String("sheet", "", "操作 Excel 时指定的工作表名称或索引 (0, 1...)")
// dataStr holds the JSON payload handed to applyData.
dataStr = flag.String("data", "", "注入数据的 JSON 字符串 (支持对象数组或单个对象)")
// inspect switches main to runInspect, which prints metadata and returns.
inspect = flag.Bool("inspect", false, "只查看文档元数据 (如类型、页数、工作表列表等)")
// version prints docVersion and exits.
version = flag.Bool("v", false, "显示版本信息")
)
// docVersion is the version string shown by -v and in the usage banner.
const docVersion = "1.0.0"
// main is the entry point of the doc CLI.
//
// It obtains a document (created via -create, or opened from the first
// positional argument), optionally injects -data into it, and then either
// prints metadata (-inspect), saves it (-o), or prints its content.
//
// Flags may appear before or after the file name: the usage text advertises
// forms such as "doc manual.docx --json", which plain flag.Parse would not
// honour because it stops at the first non-flag argument.
func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "🗂️ Document CLI (doc) - 极简办公文档处理工具 v%s\n\n", docVersion)
		fmt.Fprintf(os.Stderr, "用法:\n")
		fmt.Fprintf(os.Stderr, " doc [flags] [file] # 处理已有文件\n")
		fmt.Fprintf(os.Stderr, " doc --create [type] [flags] # 创建新文档\n\n")
		fmt.Fprintf(os.Stderr, "常见示例:\n")
		fmt.Fprintf(os.Stderr, " doc report.xlsx # 预览 Excel 内容 (Markdown 表格)\n")
		fmt.Fprintf(os.Stderr, " doc manual.docx --json # 提取 Word 内容为结构化 JSON\n")
		fmt.Fprintf(os.Stderr, " doc paper.pdf -o text.md # 提取 PDF 文字并存为 Markdown\n")
		fmt.Fprintf(os.Stderr, " doc --create xlsx -o n.xlsx # 创建空白 Excel\n")
		fmt.Fprintf(os.Stderr, " doc test.xlsx --data '[{\"ID\":1}]' -o test.xlsx # 向 Excel 追加数据\n\n")
		fmt.Fprintf(os.Stderr, "参数详解:\n")
		flag.PrintDefaults()
		fmt.Fprintf(os.Stderr, "\n支持的格式:\n")
		fmt.Fprintf(os.Stderr, " Excel (.xlsx), Word (.docx), PDF (.pdf), PPT (.pptx), CSV (.csv), Graph (.graph), Markdown (.md)\n")
	}

	args := parseInterspersed(os.Args[1:])

	if *version {
		fmt.Printf("doc version %s\n", docVersion)
		return
	}

	var doc document.Document
	var err error

	// 1. Obtain the document: -create wins over a file argument.
	switch {
	case *createType != "":
		doc, err = document.Create(*createType)
		if err != nil {
			fail("创建文档失败: %v", err)
		}
	case len(args) > 0:
		filename := args[0]
		if *password != "" {
			doc, err = document.Open(filename, *password)
		} else {
			doc, err = document.Open(filename)
		}
		if err != nil {
			fail("无法打开文件 '%s': %v", filename, err)
		}
	default:
		flag.Usage()
		return
	}

	// 2. Optional data injection.
	if *dataStr != "" {
		applyData(doc, *dataStr, *sheetName)
	}

	// 3. Inspect, save, or print.
	if *inspect {
		runInspect(doc)
		return
	}

	if *savePath != "" {
		if err := doc.Save(*savePath); err != nil {
			fail("保存失败: %v", err)
		}
		fmt.Printf("✨ 成功保存至: %s\n", *savePath)
		return
	}
	outputContent(doc, *jsonOut)
}

// parseInterspersed parses args with the default flag set while allowing
// flags to follow positional arguments. It returns the positional arguments
// in their original order.
//
// The flag package stops at the first non-flag argument, so parsing is
// resumed after each positional until nothing is left. Known limitation: a
// "--" terminator only protects the first argument after it; later arguments
// that look like flags are still parsed as flags.
func parseInterspersed(args []string) []string {
	var positional []string
	for {
		// The default flag set normally exits by itself on a bad flag; the
		// explicit check keeps behaviour sane if its error handling differs.
		if err := flag.CommandLine.Parse(args); err != nil {
			os.Exit(2)
		}
		rest := flag.Args()
		if len(rest) == 0 {
			return positional
		}
		positional = append(positional, rest[0])
		args = rest[1:]
	}
}
// applyData decodes dataStr as JSON and injects it into doc.
//
// dataStr is first decoded as an array of objects; if that fails it is
// decoded as a single object and wrapped into a one-element array. When both
// attempts fail the program terminates via fail, reporting the error of the
// array attempt. Only *document.Excel receives the data (written to the given
// sheet starting at "A1"); other document types only produce a notice.
func applyData(doc document.Document, dataStr, sheet string) {
	var rows []map[string]any
	if arrErr := cast.UnmarshalJSON(dataStr, &rows); arrErr != nil {
		var obj map[string]any
		if objErr := cast.UnmarshalJSON(dataStr, &obj); objErr != nil {
			fail("数据格式无效,请提供有效的 JSON 对象或数组: %v", arrErr)
		} else {
			rows = []map[string]any{obj}
		}
	}

	switch target := doc.(type) {
	case *document.Graph:
		fmt.Println("⚠️ 提示: Graph 类型目前主要通过 API 操作,暂不支持通过 CLI 批量 SetData。")
	case *document.Excel:
		if writeErr := target.SetData(sheet, rows, "A1", ""); writeErr != nil {
			fail("写入 Excel 失败: %v", writeErr)
		}
	default:
		fmt.Printf("⚠️ 警告: 当前文档类型 (%T) 不支持数据注入操作。\n", target)
	}
}
// runInspect prints a short description of doc to standard output: its Go
// type, then the sheet names for Excel documents, or the metadata entries
// for PDF documents ("pages" first, the rest in map iteration order).
func runInspect(doc document.Document) {
	fmt.Print("🔍 文档详情:\n")
	fmt.Printf(" 类型: %T\n", doc)

	if xls, isExcel := doc.(*document.Excel); isExcel {
		fmt.Printf(" 工作表: %s\n", strings.Join(xls.Sheets(), ", "))
		return
	}

	pdfDoc, isPDF := doc.(*document.PDF)
	if !isPDF {
		return
	}
	if pages, found := pdfDoc.Metadata["pages"]; found {
		fmt.Printf(" 总页数: %v\n", pages)
	}
	for key, val := range pdfDoc.Metadata {
		if key == "pages" {
			continue // already printed above
		}
		fmt.Printf(" %s: %v\n", key, val)
	}
}
// outputContent prints doc to standard output, as JSON when asJSON is true
// and as Markdown otherwise. An empty Markdown rendering is replaced by a
// placeholder line.
func outputContent(doc document.Document, asJSON bool) {
	if asJSON {
		fmt.Println(doc.ToJSON())
		return
	}

	text := doc.ToMarkdown()
	if text == "" {
		text = "(文档内容为空)"
	}
	fmt.Println(text)
}
// fail writes a formatted error message, prefixed with the error marker and
// terminated by a newline, to standard error and exits with status 1.
func fail(format string, a ...any) {
	const prefix = "❌ 错误: "
	fmt.Fprint(os.Stderr, prefix+fmt.Sprintf(format, a...)+"\n")
	os.Exit(1)
}

77
docx.go
View File

@ -45,7 +45,7 @@ func OpenDocx(filename string) (*Docx, error) {
return d, nil return d, nil
} }
// extractMarkdown 尝试从 docx 的 XML 中提取带标题和表格的 Markdown。 // extractMarkdown 尝试从 docx 的 XML 中提取带标题的 Markdown。
func (d *Docx) extractMarkdown(filename string) (string, error) { func (d *Docx) extractMarkdown(filename string) (string, error) {
r, err := zip.OpenReader(filename) r, err := zip.OpenReader(filename)
if err != nil { if err != nil {
@ -69,10 +69,6 @@ func (d *Docx) extractMarkdown(filename string) (string, error) {
var sb strings.Builder var sb strings.Builder
var inT bool var inT bool
var currentStyle string var currentStyle string
var inTable bool
var tableRows [][]string
var currentRow []string
var cellText strings.Builder
for { for {
t, err := decoder.Token() t, err := decoder.Token()
@ -85,56 +81,29 @@ func (d *Docx) extractMarkdown(filename string) (string, error) {
switch se := t.(type) { switch se := t.(type) {
case xml.StartElement: case xml.StartElement:
switch se.Name.Local { if se.Name.Local == "p" {
case "p":
currentStyle = "" currentStyle = ""
cellText.Reset() } else if se.Name.Local == "pStyle" {
case "pStyle":
for _, attr := range se.Attr { for _, attr := range se.Attr {
if attr.Name.Local == "val" { if attr.Name.Local == "val" {
currentStyle = attr.Value currentStyle = attr.Value
} }
} }
case "t": } else if se.Name.Local == "t" {
inT = true inT = true
case "tbl":
inTable = true
tableRows = nil
case "tr":
currentRow = nil
case "tc":
cellText.Reset()
} }
case xml.EndElement: case xml.EndElement:
switch se.Name.Local { if se.Name.Local == "p" {
case "p":
if inTable {
// Paragraph inside table cell is handled by cellEnd
} else {
sb.WriteString("\n")
}
case "t":
inT = false
case "tc":
currentRow = append(currentRow, strings.TrimSpace(cellText.String()))
cellText.Reset()
case "tr":
tableRows = append(tableRows, currentRow)
case "tbl":
inTable = false
sb.WriteString(renderMarkdownTable(tableRows))
sb.WriteString("\n") sb.WriteString("\n")
} else if se.Name.Local == "t" {
inT = false
} }
case xml.CharData: case xml.CharData:
text := string(se) if inT {
if inTable { text := string(se)
cellText.WriteString(text) if strings.Contains(strings.ToLower(currentStyle), "heading") {
} else if inT {
if strings.Contains(strings.ToLower(currentStyle), "heading") ||
strings.Contains(strings.ToLower(currentStyle), "title") ||
strings.Contains(strings.ToLower(currentStyle), "subject") {
level := "1" level := "1"
if strings.Contains(strings.ToLower(currentStyle), "heading") && len(currentStyle) > 7 { if len(currentStyle) > 7 {
level = currentStyle[7:] level = currentStyle[7:]
} }
l := cast.To[int](level) l := cast.To[int](level)
@ -153,30 +122,6 @@ func (d *Docx) extractMarkdown(filename string) (string, error) {
return strings.TrimSpace(sb.String()), nil return strings.TrimSpace(sb.String()), nil
} }
// renderMarkdownTable renders rows as a Markdown table, using the first row
// as the header. It returns "" when rows is empty; otherwise the result
// starts with a blank line so the table is separated from preceding text.
//
// Cell text is sanitised so it cannot break the table: "|" is escaped as
// "\|", and line breaks (CRLF, LF, CR) are collapsed to a single space,
// because a raw newline inside a cell would end the Markdown row early.
// The separator row is sized from the header row only.
func renderMarkdownTable(rows [][]string) string {
	if len(rows) == 0 {
		return ""
	}

	// "\r\n" is listed before "\r" so a CRLF pair becomes one space.
	cellEscaper := strings.NewReplacer(
		"\r\n", " ",
		"\n", " ",
		"\r", " ",
		"|", "\\|",
	)

	var sb strings.Builder
	sb.WriteString("\n")
	for i, row := range rows {
		sb.WriteString("| ")
		for _, col := range row {
			sb.WriteString(cellEscaper.Replace(col))
			sb.WriteString(" | ")
		}
		sb.WriteString("\n")
		if i == 0 {
			sb.WriteString("|")
			for range row {
				sb.WriteString(" --- |")
			}
			sb.WriteString("\n")
		}
	}
	return sb.String()
}
// ToJSON 返回包含元数据和内容的 JSON 字符串。 // ToJSON 返回包含元数据和内容的 JSON 字符串。
func (d *Docx) ToJSON() string { func (d *Docx) ToJSON() string {
res, _ := cast.ToJSON(map[string]any{ res, _ := cast.ToJSON(map[string]any{

View File

@ -87,7 +87,7 @@ func (xls *Excel) ToMarkdown() string {
} }
if len(sheets) > 1 { if len(sheets) > 1 {
sb.WriteString("# Sheet: " + sheetName + "\n\n") sb.WriteString("## Sheet: " + sheetName + "\n\n")
} }
for i, row := range rows { for i, row := range rows {

15
go.mod
View File

@ -3,18 +3,17 @@ module apigo.cc/go/document
go 1.25.0 go 1.25.0
require ( require (
apigo.cc/go/cast v1.5.0 apigo.cc/go/cast v1.3.2
apigo.cc/go/file v1.5.0 apigo.cc/go/file v1.3.1
apigo.cc/go/jsmod v1.5.0
github.com/dslipak/pdf v0.0.2 github.com/dslipak/pdf v0.0.2
github.com/xuri/excelize/v2 v2.10.1 github.com/xuri/excelize/v2 v2.10.1
github.com/young2j/oxmltotext v1.0.3 github.com/young2j/oxmltotext v1.0.3
) )
require ( require (
apigo.cc/go/encoding v1.5.0 // indirect apigo.cc/go/encoding v1.3.0 // indirect
apigo.cc/go/rand v1.5.0 // indirect apigo.cc/go/rand v1.3.0 // indirect
apigo.cc/go/safe v1.5.0 // indirect apigo.cc/go/safe v1.3.0 // indirect
github.com/andybalholm/brotli v1.0.5 // indirect github.com/andybalholm/brotli v1.0.5 // indirect
github.com/dgrr/quickxml v0.0.0-20201022091424-4977de546d6c // indirect github.com/dgrr/quickxml v0.0.0-20201022091424-4977de546d6c // indirect
github.com/klauspost/compress v1.17.0 // indirect github.com/klauspost/compress v1.17.0 // indirect
@ -28,10 +27,10 @@ require (
github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 // indirect github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 // indirect
go.uber.org/multierr v1.10.0 // indirect go.uber.org/multierr v1.10.0 // indirect
go.uber.org/zap v1.26.0 // indirect go.uber.org/zap v1.26.0 // indirect
golang.org/x/crypto v0.52.0 // indirect golang.org/x/crypto v0.51.0 // indirect
golang.org/x/image v0.40.0 // indirect golang.org/x/image v0.40.0 // indirect
golang.org/x/net v0.54.0 // indirect golang.org/x/net v0.54.0 // indirect
golang.org/x/sys v0.45.0 // indirect golang.org/x/sys v0.44.0 // indirect
golang.org/x/text v0.37.0 // indirect golang.org/x/text v0.37.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect
) )

28
go.sum
View File

@ -1,15 +1,13 @@
apigo.cc/go/cast v1.5.0 h1:UBGJtFQ8eJPMQXs37cUgqd7YQo1zI9opuSDBDmn2/pE= apigo.cc/go/cast v1.3.2 h1:hh9MWDSwh3T/kQdCHjFpjDwHrh2A05Q4wt1AAWs8NBI=
apigo.cc/go/cast v1.5.0/go.mod h1:z2GW5p5WCZGEqVVIJUdhl232vRbLf2Qu4EDlEakX/D8= apigo.cc/go/cast v1.3.2/go.mod h1:lGlwImiOvHxG7buyMWhFzcdvQzmSaoKbmr7bcDfUpHk=
apigo.cc/go/encoding v1.5.0 h1:EJNdRVDOMoI2DAvZwQNQTbYuqB/6zsEzvg7lS5pQI+I= apigo.cc/go/encoding v1.3.0 h1:8jqNHoZBR8vOU/BGsLFebfp1Txa1UxDRpd7YwzIFLJs=
apigo.cc/go/encoding v1.5.0/go.mod h1:8++NfZj3hWig0qh2g7GQRw/4LpSvCYMWUZ+8J+x58cA= apigo.cc/go/encoding v1.3.0/go.mod h1:kT/uUJiuAOkZ4LzUWrUtk/I0iL1D8aatvD+59bDnHBo=
apigo.cc/go/file v1.5.0 h1:Fh1NSDBqaxjuXYJ71yPHPXVJ8BFEv/AGS3l+jkLi5uw= apigo.cc/go/file v1.3.1 h1:qHgiJsn1K9DazWRrPoHVnXtp6hDGGsUpAE/4G1bFXqY=
apigo.cc/go/file v1.5.0/go.mod h1:4YhOGgBINTpmmmgws3H8LAyXQQBGzBp44hYUoCS+kr0= apigo.cc/go/file v1.3.1/go.mod h1:pYHBlB/XwsrnWpEh7GIFpbiqobrExfiB+rEN8V2d2kY=
apigo.cc/go/jsmod v1.5.0 h1:JgQtJNiJWy1NOP9AzE8NX5VXJkpO/x3GqLsCCSny5Ec= apigo.cc/go/rand v1.3.0 h1:k+UFAhMySwXf+dq8Om9TniZV6fm6gAE0evbrqMEdwQU=
apigo.cc/go/jsmod v1.5.0/go.mod h1:bmyeZtOAP/j5am+YRnaiM89smysK24K7ebk0koFtsSw= apigo.cc/go/rand v1.3.0/go.mod h1:mZ/4Soa3bk+XvDaqPWJuUe1bfEi4eThBj1XmEAuYxsk=
apigo.cc/go/rand v1.5.0 h1:1o8hh8fhdBuk1/h02IvugvamuT3dkWbVJrqEJVQKB2E= apigo.cc/go/safe v1.3.0 h1:uctdAUsphT9p60Tk4oS5xPCe0NoIdOHfsYv4PNS0Rok=
apigo.cc/go/rand v1.5.0/go.mod h1:Lh98S2dm9UY0X+M+kNQQEKyXHG5pcCKSFPyXN0QCGdk= apigo.cc/go/safe v1.3.0/go.mod h1:tC9X14V+qh0BqIrVg4UkXbl+2pEN+lj2ZNI8IjDB6Fs=
apigo.cc/go/safe v1.5.0 h1:W1NblmcU8cex1f9Y5z8mNLUJOzZTE1s6fszb3FbhGnk=
apigo.cc/go/safe v1.5.0/go.mod h1:OfQ5d6COePSGEuPvMeOk6KagX2sezw7nvKh7exj9SeM=
github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs= github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs=
github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig= github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
@ -58,12 +56,14 @@ go.uber.org/multierr v1.10.0 h1:S0h4aNzvfcFsC3dRF1jLoaov7oRaKqRGC/pUEJ2yvPQ=
go.uber.org/multierr v1.10.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/multierr v1.10.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo= go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo=
go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so= go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so=
golang.org/x/crypto v0.52.0 h1:RMs7fP2rXdep0CftQlK8Uf+kibLm7qkCcradZWYz988= golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI=
golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8=
golang.org/x/image v0.40.0 h1:Tw4GyDXMo+daZN1znreBRC3VayR1aLFUyUEOLUdW1a8= golang.org/x/image v0.40.0 h1:Tw4GyDXMo+daZN1znreBRC3VayR1aLFUyUEOLUdW1a8=
golang.org/x/image v0.40.0/go.mod h1:uIc348UZMSvS5Z65CVZ7iDPaNobNFEPeJ4kbqTOszmA= golang.org/x/image v0.40.0/go.mod h1:uIc348UZMSvS5Z65CVZ7iDPaNobNFEPeJ4kbqTOszmA=
golang.org/x/net v0.54.0 h1:2zJIZAxAHV/OHCDTCOHAYehQzLfSXuf/5SoL/Dv6w/w= golang.org/x/net v0.54.0 h1:2zJIZAxAHV/OHCDTCOHAYehQzLfSXuf/5SoL/Dv6w/w=
golang.org/x/net v0.54.0/go.mod h1:Sj4oj8jK6XmHpBZU/zWHw3BV3abl4Kvi+Ut7cQcY+cQ= golang.org/x/net v0.54.0/go.mod h1:Sj4oj8jK6XmHpBZU/zWHw3BV3abl4Kvi+Ut7cQcY+cQ=
golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY= golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ=
golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc= golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc=
golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38= golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=

View File

@ -1,98 +0,0 @@
package document
import (
"context"
"apigo.cc/go/file"
"apigo.cc/go/jsmod"
)
func init() {
jsmod.Register("document", map[string]any{
"Open": func(ctx context.Context, filename string, password *string) (*jsDocument, error) {
p, err := file.VerifyPathForSafeMode(ctx, filename)
if err != nil {
return nil, err
}
var doc Document
if password != nil {
doc, err = Open(p, *password)
} else {
doc, err = Open(p)
}
if err != nil {
return nil, err
}
return &jsDocument{ctx: ctx, d: doc}, nil
},
"Create": func(ctx context.Context, ext string) (*jsDocument, error) {
doc, err := Create(ext)
if err != nil {
return nil, err
}
return &jsDocument{ctx: ctx, d: doc}, nil
},
"NewExcel": func(ctx context.Context) *jsDocument {
return &jsDocument{ctx: ctx, d: NewExcel()}
},
"NewGraph": func(ctx context.Context) *jsDocument {
return &jsDocument{ctx: ctx, d: NewGraph()}
},
})
}
// jsDocument is the object handed to JS callers. It wraps a Document
// together with the context of the call that created it, which Save uses
// for its path check.
type jsDocument struct {
	ctx context.Context
	d   Document
}

// ToJSON returns the wrapped document's JSON representation.
func (j *jsDocument) ToJSON() string {
	return j.d.ToJSON()
}

// ToMarkdown returns the wrapped document's Markdown representation.
func (j *jsDocument) ToMarkdown() string {
	return j.d.ToMarkdown()
}

// Save stores the document. A nil or empty filename is forwarded to the
// underlying Save as ""; any other filename is first checked with
// file.VerifyPathForSafeMode and the verified path is used.
func (j *jsDocument) Save(filename *string) error {
	if filename == nil || *filename == "" {
		return j.d.Save("")
	}
	safePath, err := file.VerifyPathForSafeMode(j.ctx, *filename)
	if err != nil {
		return err
	}
	return j.d.Save(safePath)
}

// Excel-only helpers. When the wrapped document is not an *Excel they do
// nothing and return zero values, including a nil error.

// Get forwards to Excel.Get for the given sheet and cell range.
func (j *jsDocument) Get(sheetName string, start, end string) ([][]any, error) {
	xls, isExcel := j.d.(*Excel)
	if !isExcel {
		return nil, nil
	}
	return xls.Get(sheetName, start, end)
}

// GetData forwards to Excel.GetData for the given sheet and cell range.
func (j *jsDocument) GetData(sheetName string, start, end string) ([]map[string]any, error) {
	xls, isExcel := j.d.(*Excel)
	if !isExcel {
		return nil, nil
	}
	return xls.GetData(sheetName, start, end)
}

// Set forwards to Excel.Set with the given table and cell range.
func (j *jsDocument) Set(sheetName string, table [][]any, start, end string) error {
	xls, isExcel := j.d.(*Excel)
	if !isExcel {
		return nil
	}
	return xls.Set(sheetName, table, start, end)
}

// SetData forwards to Excel.SetData with the given records and cell range.
func (j *jsDocument) SetData(sheetName string, data []map[string]any, start, end string) error {
	xls, isExcel := j.d.(*Excel)
	if !isExcel {
		return nil
	}
	return xls.SetData(sheetName, data, start, end)
}

// Sheets returns the Excel sheet list, or nil for other document types.
func (j *jsDocument) Sheets() []string {
	xls, isExcel := j.d.(*Excel)
	if !isExcel {
		return nil
	}
	return xls.Sheets()
}

366
pdf.go
View File

@ -2,7 +2,6 @@ package document
import ( import (
"fmt" "fmt"
"sort"
"strings" "strings"
"apigo.cc/go/cast" "apigo.cc/go/cast"
@ -28,355 +27,42 @@ func OpenPDF(filename string) (*PDF, error) {
} }
f, err := pdf.Open(filename) f, err := pdf.Open(filename)
if err != nil { if err == nil {
return nil, err p.Metadata["pages"] = f.NumPage()
} trailer := f.Trailer()
infoDict := trailer.Key("Info")
p.Metadata["pages"] = f.NumPage() if !infoDict.IsNull() {
trailer := f.Trailer() for _, field := range infoDict.Keys() {
infoDict := trailer.Key("Info") val := infoDict.Key(field).Text()
if !infoDict.IsNull() { if val != "" {
for _, field := range infoDict.Keys() { p.Metadata[strings.ToLower(field)] = val
val := infoDict.Key(field).Text()
if val != "" {
p.Metadata[strings.ToLower(field)] = val
}
}
}
// 收集所有页面的 block
type Block struct {
Type string
Level int
Text string
FontSize float64
}
var allBlocks []Block
for i := 1; i <= f.NumPage(); i++ {
page := f.Page(i)
if page.V.IsNull() {
continue
}
content := page.Content()
texts := content.Text
if len(texts) == 0 {
continue
}
// 1. 估算正文字体大小(众数)
fontSizes := make(map[int]int)
for _, t := range texts {
fontSizes[int(t.FontSize)]++
}
bodySize := 0
maxCount := 0
for size, count := range fontSizes {
if count > maxCount {
maxCount = count
bodySize = size
}
}
// 2. 按行分组(基于 Y 坐标)
type Line struct {
Y float64
MaxFontSize float64
Text string
Texts []pdf.Text
}
var lines []Line
for _, t := range texts {
found := false
for i := range lines {
if t.Y > lines[i].Y-2 && t.Y < lines[i].Y+2 {
lines[i].Texts = append(lines[i].Texts, t)
found = true
break
} }
} }
if !found {
lines = append(lines, Line{Y: t.Y, Texts: []pdf.Text{t}})
}
} }
sort.Slice(lines, func(i, j int) bool { var sb strings.Builder
return lines[i].Y > lines[j].Y for i := 1; i <= f.NumPage(); i++ {
}) p_ := f.Page(i)
if p_.V.IsNull() {
for i := range lines {
sort.Slice(lines[i].Texts, func(m, n int) bool {
return lines[i].Texts[m].X < lines[i].Texts[n].X
})
var sb strings.Builder
maxFS := 0.0
for _, t := range lines[i].Texts {
sb.WriteString(t.S)
if t.FontSize > maxFS {
maxFS = t.FontSize
}
}
lines[i].Text = strings.TrimSpace(sb.String())
lines[i].MaxFontSize = maxFS
}
// 3. 语义块识别
for j := 0; j < len(lines); j++ {
line := lines[j]
if line.Text == "" {
continue continue
} }
t := p_.Content().Text
isTableLine, cells := p.identifyTableLine(line) if len(t) > 0 {
if isTableLine { if i > 1 {
tableStr := "| " + strings.Join(cells, " | ") + " |" sb.WriteString("\n\n")
allBlocks = append(allBlocks, Block{Type: "table", Text: tableStr})
continue
}
if line.MaxFontSize > float64(bodySize)+1 {
level := 1
if line.MaxFontSize < float64(bodySize)+4 {
level = 3
} else if line.MaxFontSize < float64(bodySize)+8 {
level = 2
} }
sb.WriteString(fmt.Sprintf("<!-- Page %d -->\n", i))
fullText := line.Text for _, text := range t {
lastY := line.Y sb.WriteString(text.S)
for k := j + 1; k < len(lines); k++ {
if lines[k].Text != "" && MathAbs(lines[k].MaxFontSize-line.MaxFontSize) < 1.0 {
if MathAbs(lines[k].Y-lastY) < 25 {
fullText += " " + lines[k].Text
lastY = lines[k].Y
j = k
} else {
break
}
} else {
break
}
} }
if j < 5 && i == 1 {
if title, ok := p.Metadata["title"].(string); ok && strings.Contains(title, fullText) {
continue
}
}
if len(allBlocks) > 0 && allBlocks[len(allBlocks)-1].Type == "heading" && allBlocks[len(allBlocks)-1].Level == level {
allBlocks[len(allBlocks)-1].Text += " " + fullText
} else {
allBlocks = append(allBlocks, Block{Type: "heading", Level: level, Text: fullText})
}
} else {
if (line.Y < 50 || line.Y > 800) && isPageNumber(line.Text) {
continue
}
fullText := line.Text
lastY := line.Y
for k := j + 1; k < len(lines); k++ {
if lines[k].Text != "" && lines[k].MaxFontSize <= float64(bodySize)+1 {
isT, _ := p.identifyTableLine(lines[k])
if isT {
break
}
if (lines[k].Y < 50 || lines[k].Y > 800) && isPageNumber(lines[k].Text) {
break
}
if MathAbs(lines[k].Y-lastY) > 25 {
break
}
fullText += lines[k].Text
lastY = lines[k].Y
j = k
if isPunctuation(lines[k].Text) {
break
}
} else {
break
}
}
allBlocks = append(allBlocks, Block{Type: "paragraph", Text: fullText})
} }
} }
p.Content = strings.TrimSpace(sb.String())
} }
// 4. 智能封面/标题逻辑 (全局)
h1Count := 0
for _, b := range allBlocks {
if b.Type == "heading" && b.Level == 1 {
h1Count++
}
}
shouldPromote := h1Count <= 1
contentStarted := false
hasMetadataTitle := false
if t, ok := p.Metadata["title"].(string); ok && t != "" {
hasMetadataTitle = true
}
firstHeadingProcessed := false
var sb strings.Builder
for _, b := range allBlocks {
if b.Type == "heading" {
level := b.Level
if !contentStarted {
if !firstHeadingProcessed {
firstHeadingProcessed = true
// 如果有 Metadata Title则 Metadata Title 充当了真正的第一级标题
// 我们把遇到的第一个大标题降级为正文文本(除非它已经是标准章节)
if hasMetadataTitle && !isStandardSection(b.Text) {
sb.WriteString("\n" + b.Text + "\n\n")
continue
} else {
// 否则作为文档的主标题
sb.WriteString("\n# " + b.Text + "\n\n")
continue
}
} else {
if isStandardSection(b.Text) {
contentStarted = true
sb.WriteString("\n# " + b.Text + "\n\n")
continue
}
// 否则作为封面副标题/文本
sb.WriteString(b.Text + "\n\n")
continue
}
}
if shouldPromote {
if level == 2 {
level = 1
} else if level == 3 {
level = 2
}
}
sb.WriteString("\n" + strings.Repeat("#", level) + " " + b.Text + "\n\n")
} else if b.Type == "paragraph" {
contentStarted = true
sb.WriteString(b.Text + "\n\n")
} else if b.Type == "table" {
contentStarted = true
sb.WriteString(b.Text + "\n")
}
}
p.Content = strings.TrimSpace(sb.String())
return p, nil return p, nil
} }
// isStandardSection reports whether s looks like the heading of a regular
// document section rather than cover-page text.
//
// After trimming whitespace, s qualifies when it contains one of a fixed set
// of Chinese/English section keywords (matched case-sensitively), or when it
// is shorter than 20 runes and either starts with an ASCII digit or has the
// form "第…章/节/部分". Empty or whitespace-only input yields false; without
// that guard the digit check would index into an empty string and panic.
func isStandardSection(s string) bool {
	s = strings.TrimSpace(s)
	if s == "" {
		return false
	}

	// Common keywords that open a section.
	standards := []string{"引言", "摘要", "目录", "前言", "结论", "背景", "概述", "Introduction", "Abstract", "Conclusion", "Summary"}
	for _, std := range standards {
		if strings.Contains(s, std) {
			return true
		}
	}

	// Short, numbered headings are also treated as section starts.
	if len([]rune(s)) >= 20 {
		return false
	}
	if s[0] >= '0' && s[0] <= '9' {
		return true
	}
	return strings.HasPrefix(s, "第") &&
		(strings.Contains(s, "章") || strings.Contains(s, "节") || strings.Contains(s, "部分"))
}
// isPageNumber reports whether s, after trimming whitespace, looks like a
// page-number line: only ASCII digits, text starting with "page" in any
// letter case, or text that both starts and ends with "-". Empty input
// yields false.
func isPageNumber(s string) bool {
	s = strings.TrimSpace(s)
	if s == "" {
		return false
	}

	notDigit := func(r rune) bool { return r < '0' || r > '9' }
	if strings.IndexFunc(s, notDigit) < 0 {
		return true
	}

	if strings.HasPrefix(strings.ToLower(s), "page") {
		return true
	}
	return strings.HasPrefix(s, "-") && strings.HasSuffix(s, "-")
}
// isPunctuation reports whether the last rune of s is one of the
// sentence-ending marks in the set below. Empty input yields false.
func isPunctuation(s string) bool {
	if s == "" {
		return false
	}
	var final rune
	for _, r := range s {
		final = r // after the loop this holds the last rune of s
	}
	return strings.IndexRune("。.!?:", final) >= 0
}
// identifyTableLine decides whether a text line looks like a table row and,
// if so, returns its cells.
//
// Fragments in line.Texts are walked in the order given; a new cell starts
// whenever the gap between a fragment's X and the previous fragment's right
// edge (X + W) exceeds 40 (presumably PDF user-space units — TODO confirm).
// Cells that are empty after trimming are dropped. The line is rejected when
// it has fewer than 3 fragments, fewer than 2 cells, any cell longer than 40
// runes, or only single-rune cells.
func (p *PDF) identifyTableLine(line struct {
	Y           float64
	MaxFontSize float64
	Text        string
	Texts       []pdf.Text
}) (bool, []string) {
	if len(line.Texts) < 3 {
		return false, nil
	}

	var cells []string
	var pending strings.Builder
	// flush appends the accumulated cell text unless it is blank.
	flush := func() {
		if cell := strings.TrimSpace(pending.String()); cell != "" {
			cells = append(cells, cell)
		}
	}

	prevRight := -1.0 // -1 marks "no fragment seen yet"
	for _, frag := range line.Texts {
		if prevRight != -1.0 && frag.X-prevRight > 40 {
			flush()
			pending.Reset()
		}
		pending.WriteString(frag.S)
		prevRight = frag.X + frag.W
	}
	flush()

	if len(cells) < 2 {
		return false, nil
	}

	hasMultiRuneCell := false
	for _, cell := range cells {
		n := len([]rune(cell))
		if n > 40 {
			// A long cell indicates running prose, not a table column.
			return false, nil
		}
		if n > 1 {
			hasMultiRuneCell = true
		}
	}
	if !hasMultiRuneCell {
		// Widely spaced single characters are not treated as a table.
		return false, nil
	}
	return true, cells
}
// MathAbs returns the absolute value of v: negative inputs are negated and
// every other input is returned unchanged.
func MathAbs(v float64) float64 {
	switch {
	case v < 0:
		return -v
	default:
		return v
	}
}
// ToJSON 返回结构化 JSON。 // ToJSON 返回结构化 JSON。
func (p *PDF) ToJSON() string { func (p *PDF) ToJSON() string {
res, _ := cast.ToJSON(map[string]any{ res, _ := cast.ToJSON(map[string]any{
@ -388,15 +74,7 @@ func (p *PDF) ToJSON() string {
// ToMarkdown 返回 Markdown。 // ToMarkdown 返回 Markdown。
func (p *PDF) ToMarkdown() string { func (p *PDF) ToMarkdown() string {
if p.Content == "" { return p.Content
return ""
}
var sb strings.Builder
if title, ok := p.Metadata["title"]; ok && title != "" {
sb.WriteString("# " + cast.To[string](title) + "\n\n")
}
sb.WriteString(p.Content)
return sb.String()
} }
// Save 保存(目前保存为提取后的文本)。 // Save 保存(目前保存为提取后的文本)。

37
pptx.go
View File

@ -3,7 +3,6 @@ package document
import ( import (
"fmt" "fmt"
"io" "io"
"strings"
"apigo.cc/go/cast" "apigo.cc/go/cast"
"apigo.cc/go/file" "apigo.cc/go/file"
@ -47,41 +46,7 @@ func (p *Pptx) ToJSON() string {
// ToMarkdown 返回 Markdown。 // ToMarkdown 返回 Markdown。
func (p *Pptx) ToMarkdown() string { func (p *Pptx) ToMarkdown() string {
if p.Content == "" { return p.Content
return ""
}
lines := strings.Split(p.Content, "\n")
var res []string
nextIsMainTitle := true
for _, line := range lines {
trimmed := strings.TrimSpace(line)
if trimmed == "" {
continue
}
// 检查幻灯片分隔符
if strings.Contains(line, "--------------------------------") {
res = append(res, "\n---")
nextIsMainTitle = true
continue
}
// 启发式识别标题:行短、无句末标点
if len([]rune(trimmed)) < 50 && !strings.HasSuffix(trimmed, ".") && !strings.HasSuffix(trimmed, "。") && !strings.HasSuffix(trimmed, ":") && !strings.HasSuffix(trimmed, "") {
if nextIsMainTitle {
res = append(res, "\n# "+trimmed)
nextIsMainTitle = false
} else {
res = append(res, "\n## "+trimmed)
}
} else {
res = append(res, trimmed)
// 如果该页已经有了主标题,后续的长文本不会重置标题状态
}
}
return strings.TrimSpace(strings.Join(res, "\n"))
} }
// Save 保存文档(目前保存为提取后的文本)。 // Save 保存文档(目前保存为提取后的文本)。