Compare commits
No commits in common. "main" and "v1.0.9" have entirely different histories.
18
CHANGELOG.md
18
CHANGELOG.md
@ -1,23 +1,5 @@
|
|||||||
# CHANGELOG
|
# CHANGELOG
|
||||||
|
|
||||||
## v1.5.1 (2026-06-08)
|
|
||||||
- **JS 对齐 & 智能文档**:
|
|
||||||
- 将所有注册到 `jsmod` 的方法名统一为 PascalCase。
|
|
||||||
- **可选参数优化**: 将 `Open` 的 `password` 和 `Save` 的 `filename` 改为指针类型。配合最新的 `go/js` 引擎,生成的 `.d.ts` 定义将正确提示为可选参数 `?`,极大优化了 AI 编码体验。
|
|
||||||
|
|
||||||
## v1.1.0 (2026-05-17)
|
|
||||||
- **PDF 语义重构**: 引入全局语义分析引擎。
|
|
||||||
- **无缝流**: 彻底移除分页干扰(移除 `---` 和 `Page X` 标记),实现跨页内容自然合并。
|
|
||||||
- **智能标题层级**: 自动锁定文档总标题,智能识别并合并跨行长标题,避免封面期标题碎片化。
|
|
||||||
- **自动平衡**: 根据文档内容自动提升章节标题层级,确保 Markdown 目录结构平衡。
|
|
||||||
- **鲁棒表格识别**: 调优列间距算法并增加长句过滤,大幅降低 PDF 误判表格的概率。
|
|
||||||
- **转换能力全面对齐**:
|
|
||||||
- **PPTX**: 每一页幻灯片标题现在统一识别为 `#` 一级标题,优化页面视觉分层。
|
|
||||||
- **DOCX**: 重构 XML 解析,完美支持 Word 原生表格 (`w:tbl`) 转换为 Markdown 表格。
|
|
||||||
- **XLSX**: 提升 Sheet 名称为 `#` 标题。
|
|
||||||
- **工程化增强**: 新增 `test_res/testmd.sh` 自动化验证脚本,覆盖 4 种主流办公格式的 Markdown 转换质量。
|
|
||||||
- **接口一致性**: 强制所有格式 Markdown 输出从一级标题 (`#`) 开始。
|
|
||||||
|
|
||||||
## v1.0.9 (2026-05-17)
|
## v1.0.9 (2026-05-17)
|
||||||
- **新特性**: 内置功能完整的命令行工具 `doc` (`cmd/doc`)。
|
- **新特性**: 内置功能完整的命令行工具 `doc` (`cmd/doc`)。
|
||||||
- **功能增强**: `doc` 支持创建、打开、查看预览(Markdown)、转换为 JSON、查看元数据(Inspect)以及数据注入(Excel)。
|
- **功能增强**: `doc` 支持创建、打开、查看预览(Markdown)、转换为 JSON、查看元数据(Inspect)以及数据注入(Excel)。
|
||||||
|
|||||||
15
go.mod
15
go.mod
@ -3,18 +3,17 @@ module apigo.cc/go/document
|
|||||||
go 1.25.0
|
go 1.25.0
|
||||||
|
|
||||||
require (
|
require (
|
||||||
apigo.cc/go/cast v1.5.0
|
apigo.cc/go/cast v1.3.3
|
||||||
apigo.cc/go/file v1.5.0
|
apigo.cc/go/file v1.3.2
|
||||||
apigo.cc/go/jsmod v1.5.0
|
|
||||||
github.com/dslipak/pdf v0.0.2
|
github.com/dslipak/pdf v0.0.2
|
||||||
github.com/xuri/excelize/v2 v2.10.1
|
github.com/xuri/excelize/v2 v2.10.1
|
||||||
github.com/young2j/oxmltotext v1.0.3
|
github.com/young2j/oxmltotext v1.0.3
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
apigo.cc/go/encoding v1.5.0 // indirect
|
apigo.cc/go/encoding v1.3.1 // indirect
|
||||||
apigo.cc/go/rand v1.5.0 // indirect
|
apigo.cc/go/rand v1.3.1 // indirect
|
||||||
apigo.cc/go/safe v1.5.0 // indirect
|
apigo.cc/go/safe v1.3.1 // indirect
|
||||||
github.com/andybalholm/brotli v1.0.5 // indirect
|
github.com/andybalholm/brotli v1.0.5 // indirect
|
||||||
github.com/dgrr/quickxml v0.0.0-20201022091424-4977de546d6c // indirect
|
github.com/dgrr/quickxml v0.0.0-20201022091424-4977de546d6c // indirect
|
||||||
github.com/klauspost/compress v1.17.0 // indirect
|
github.com/klauspost/compress v1.17.0 // indirect
|
||||||
@ -28,10 +27,10 @@ require (
|
|||||||
github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 // indirect
|
github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 // indirect
|
||||||
go.uber.org/multierr v1.10.0 // indirect
|
go.uber.org/multierr v1.10.0 // indirect
|
||||||
go.uber.org/zap v1.26.0 // indirect
|
go.uber.org/zap v1.26.0 // indirect
|
||||||
golang.org/x/crypto v0.52.0 // indirect
|
golang.org/x/crypto v0.51.0 // indirect
|
||||||
golang.org/x/image v0.40.0 // indirect
|
golang.org/x/image v0.40.0 // indirect
|
||||||
golang.org/x/net v0.54.0 // indirect
|
golang.org/x/net v0.54.0 // indirect
|
||||||
golang.org/x/sys v0.45.0 // indirect
|
golang.org/x/sys v0.44.0 // indirect
|
||||||
golang.org/x/text v0.37.0 // indirect
|
golang.org/x/text v0.37.0 // indirect
|
||||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||||
)
|
)
|
||||||
|
|||||||
28
go.sum
28
go.sum
@ -1,15 +1,13 @@
|
|||||||
apigo.cc/go/cast v1.5.0 h1:UBGJtFQ8eJPMQXs37cUgqd7YQo1zI9opuSDBDmn2/pE=
|
apigo.cc/go/cast v1.3.3 h1:aln5eDR5DZVWVzZ/y5SJh1gQNgWv2sT82I25NaO9g34=
|
||||||
apigo.cc/go/cast v1.5.0/go.mod h1:z2GW5p5WCZGEqVVIJUdhl232vRbLf2Qu4EDlEakX/D8=
|
apigo.cc/go/cast v1.3.3/go.mod h1:lGlwImiOvHxG7buyMWhFzcdvQzmSaoKbmr7bcDfUpHk=
|
||||||
apigo.cc/go/encoding v1.5.0 h1:EJNdRVDOMoI2DAvZwQNQTbYuqB/6zsEzvg7lS5pQI+I=
|
apigo.cc/go/encoding v1.3.1 h1:y8O58KYAyulkThg1O2ji2BqjnFoSvk42sit9I3z+K7Y=
|
||||||
apigo.cc/go/encoding v1.5.0/go.mod h1:8++NfZj3hWig0qh2g7GQRw/4LpSvCYMWUZ+8J+x58cA=
|
apigo.cc/go/encoding v1.3.1/go.mod h1:xAJk5b83VZ31mXMTnyp0dfMoBKfT/AHDn0u+cQfojgY=
|
||||||
apigo.cc/go/file v1.5.0 h1:Fh1NSDBqaxjuXYJ71yPHPXVJ8BFEv/AGS3l+jkLi5uw=
|
apigo.cc/go/file v1.3.2 h1:pu4oiDyiqgj3/eykfnJf+/6+A9v/Z0b3ClP5XK+lwG4=
|
||||||
apigo.cc/go/file v1.5.0/go.mod h1:4YhOGgBINTpmmmgws3H8LAyXQQBGzBp44hYUoCS+kr0=
|
apigo.cc/go/file v1.3.2/go.mod h1:vci4h0Pz94mV6dkniQkuyBYERVYeq7/LX4jJVuCg9hs=
|
||||||
apigo.cc/go/jsmod v1.5.0 h1:JgQtJNiJWy1NOP9AzE8NX5VXJkpO/x3GqLsCCSny5Ec=
|
apigo.cc/go/rand v1.3.1 h1:7FvsI6PtQ5XrWER0dTiLVo0p7GIxRidT/TBKhVy93j8=
|
||||||
apigo.cc/go/jsmod v1.5.0/go.mod h1:bmyeZtOAP/j5am+YRnaiM89smysK24K7ebk0koFtsSw=
|
apigo.cc/go/rand v1.3.1/go.mod h1:mZ/4Soa3bk+XvDaqPWJuUe1bfEi4eThBj1XmEAuYxsk=
|
||||||
apigo.cc/go/rand v1.5.0 h1:1o8hh8fhdBuk1/h02IvugvamuT3dkWbVJrqEJVQKB2E=
|
apigo.cc/go/safe v1.3.1 h1:irTCqPAC97gGsX/Lw5AzLelDt1xXLEZIAaVhLELWe9Q=
|
||||||
apigo.cc/go/rand v1.5.0/go.mod h1:Lh98S2dm9UY0X+M+kNQQEKyXHG5pcCKSFPyXN0QCGdk=
|
apigo.cc/go/safe v1.3.1/go.mod h1:XdOpBhN2vkImalaykYXXmEpczqWa1y3ah6/Q72cdRqE=
|
||||||
apigo.cc/go/safe v1.5.0 h1:W1NblmcU8cex1f9Y5z8mNLUJOzZTE1s6fszb3FbhGnk=
|
|
||||||
apigo.cc/go/safe v1.5.0/go.mod h1:OfQ5d6COePSGEuPvMeOk6KagX2sezw7nvKh7exj9SeM=
|
|
||||||
github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs=
|
github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs=
|
||||||
github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
|
github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
|
||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
@ -58,12 +56,14 @@ go.uber.org/multierr v1.10.0 h1:S0h4aNzvfcFsC3dRF1jLoaov7oRaKqRGC/pUEJ2yvPQ=
|
|||||||
go.uber.org/multierr v1.10.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
|
go.uber.org/multierr v1.10.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
|
||||||
go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo=
|
go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo=
|
||||||
go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so=
|
go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so=
|
||||||
golang.org/x/crypto v0.52.0 h1:RMs7fP2rXdep0CftQlK8Uf+kibLm7qkCcradZWYz988=
|
golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI=
|
||||||
|
golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8=
|
||||||
golang.org/x/image v0.40.0 h1:Tw4GyDXMo+daZN1znreBRC3VayR1aLFUyUEOLUdW1a8=
|
golang.org/x/image v0.40.0 h1:Tw4GyDXMo+daZN1znreBRC3VayR1aLFUyUEOLUdW1a8=
|
||||||
golang.org/x/image v0.40.0/go.mod h1:uIc348UZMSvS5Z65CVZ7iDPaNobNFEPeJ4kbqTOszmA=
|
golang.org/x/image v0.40.0/go.mod h1:uIc348UZMSvS5Z65CVZ7iDPaNobNFEPeJ4kbqTOszmA=
|
||||||
golang.org/x/net v0.54.0 h1:2zJIZAxAHV/OHCDTCOHAYehQzLfSXuf/5SoL/Dv6w/w=
|
golang.org/x/net v0.54.0 h1:2zJIZAxAHV/OHCDTCOHAYehQzLfSXuf/5SoL/Dv6w/w=
|
||||||
golang.org/x/net v0.54.0/go.mod h1:Sj4oj8jK6XmHpBZU/zWHw3BV3abl4Kvi+Ut7cQcY+cQ=
|
golang.org/x/net v0.54.0/go.mod h1:Sj4oj8jK6XmHpBZU/zWHw3BV3abl4Kvi+Ut7cQcY+cQ=
|
||||||
golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY=
|
golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ=
|
||||||
|
golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||||
golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc=
|
golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc=
|
||||||
golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38=
|
golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38=
|
||||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
|
|||||||
98
js_export.go
98
js_export.go
@ -1,98 +0,0 @@
|
|||||||
package document
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
|
|
||||||
"apigo.cc/go/file"
|
|
||||||
"apigo.cc/go/jsmod"
|
|
||||||
)
|
|
||||||
|
|
||||||
func init() {
|
|
||||||
jsmod.Register("document", map[string]any{
|
|
||||||
"Open": func(ctx context.Context, filename string, password *string) (*jsDocument, error) {
|
|
||||||
p, err := file.VerifyPathForSafeMode(ctx, filename)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
var doc Document
|
|
||||||
if password != nil {
|
|
||||||
doc, err = Open(p, *password)
|
|
||||||
} else {
|
|
||||||
doc, err = Open(p)
|
|
||||||
}
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
return &jsDocument{ctx: ctx, d: doc}, nil
|
|
||||||
},
|
|
||||||
"Create": func(ctx context.Context, ext string) (*jsDocument, error) {
|
|
||||||
doc, err := Create(ext)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
return &jsDocument{ctx: ctx, d: doc}, nil
|
|
||||||
},
|
|
||||||
"NewExcel": func(ctx context.Context) *jsDocument {
|
|
||||||
return &jsDocument{ctx: ctx, d: NewExcel()}
|
|
||||||
},
|
|
||||||
"NewGraph": func(ctx context.Context) *jsDocument {
|
|
||||||
return &jsDocument{ctx: ctx, d: NewGraph()}
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
type jsDocument struct {
|
|
||||||
ctx context.Context
|
|
||||||
d Document
|
|
||||||
}
|
|
||||||
|
|
||||||
func (j *jsDocument) ToJSON() string { return j.d.ToJSON() }
|
|
||||||
func (j *jsDocument) ToMarkdown() string { return j.d.ToMarkdown() }
|
|
||||||
|
|
||||||
func (j *jsDocument) Save(filename *string) error {
|
|
||||||
var targetPath string
|
|
||||||
if filename != nil && *filename != "" {
|
|
||||||
p, err := file.VerifyPathForSafeMode(j.ctx, *filename)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
targetPath = p
|
|
||||||
}
|
|
||||||
return j.d.Save(targetPath)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Excel 增强方法 (如果底层是 Excel)
|
|
||||||
func (j *jsDocument) Get(sheetName string, start, end string) ([][]any, error) {
|
|
||||||
if x, ok := j.d.(*Excel); ok {
|
|
||||||
return x.Get(sheetName, start, end)
|
|
||||||
}
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (j *jsDocument) GetData(sheetName string, start, end string) ([]map[string]any, error) {
|
|
||||||
if x, ok := j.d.(*Excel); ok {
|
|
||||||
return x.GetData(sheetName, start, end)
|
|
||||||
}
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (j *jsDocument) Set(sheetName string, table [][]any, start, end string) error {
|
|
||||||
if x, ok := j.d.(*Excel); ok {
|
|
||||||
return x.Set(sheetName, table, start, end)
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (j *jsDocument) SetData(sheetName string, data []map[string]any, start, end string) error {
|
|
||||||
if x, ok := j.d.(*Excel); ok {
|
|
||||||
return x.SetData(sheetName, data, start, end)
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (j *jsDocument) Sheets() []string {
|
|
||||||
if x, ok := j.d.(*Excel); ok {
|
|
||||||
return x.Sheets()
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
140
pdf.go
140
pdf.go
@ -44,15 +44,7 @@ func OpenPDF(filename string) (*PDF, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 收集所有页面的 block
|
var sb strings.Builder
|
||||||
type Block struct {
|
|
||||||
Type string
|
|
||||||
Level int
|
|
||||||
Text string
|
|
||||||
FontSize float64
|
|
||||||
}
|
|
||||||
var allBlocks []Block
|
|
||||||
|
|
||||||
for i := 1; i <= f.NumPage(); i++ {
|
for i := 1; i <= f.NumPage(); i++ {
|
||||||
page := f.Page(i)
|
page := f.Page(i)
|
||||||
if page.V.IsNull() {
|
if page.V.IsNull() {
|
||||||
@ -65,6 +57,19 @@ func OpenPDF(filename string) (*PDF, error) {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 处理页面内容
|
||||||
|
sb.WriteString(p.processPageTexts(texts))
|
||||||
|
}
|
||||||
|
p.Content = strings.TrimSpace(sb.String())
|
||||||
|
|
||||||
|
return p, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *PDF) processPageTexts(texts []pdf.Text) string {
|
||||||
|
if len(texts) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
// 1. 估算正文字体大小(众数)
|
// 1. 估算正文字体大小(众数)
|
||||||
fontSizes := make(map[int]int)
|
fontSizes := make(map[int]int)
|
||||||
for _, t := range texts {
|
for _, t := range texts {
|
||||||
@ -101,10 +106,12 @@ func OpenPDF(filename string) (*PDF, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 按 Y 降序排列(从上到下)
|
||||||
sort.Slice(lines, func(i, j int) bool {
|
sort.Slice(lines, func(i, j int) bool {
|
||||||
return lines[i].Y > lines[j].Y
|
return lines[i].Y > lines[j].Y
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// 预处理每一行的文本和最大字体
|
||||||
for i := range lines {
|
for i := range lines {
|
||||||
sort.Slice(lines[i].Texts, func(m, n int) bool {
|
sort.Slice(lines[i].Texts, func(m, n int) bool {
|
||||||
return lines[i].Texts[m].X < lines[i].Texts[n].X
|
return lines[i].Texts[m].X < lines[i].Texts[n].X
|
||||||
@ -122,19 +129,28 @@ func OpenPDF(filename string) (*PDF, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 3. 语义块识别
|
// 3. 语义块识别
|
||||||
for j := 0; j < len(lines); j++ {
|
type Block struct {
|
||||||
line := lines[j]
|
Type string // heading, paragraph, table
|
||||||
|
Level int // for heading
|
||||||
|
Text string
|
||||||
|
FontSize float64
|
||||||
|
}
|
||||||
|
var blocks []Block
|
||||||
|
for i := 0; i < len(lines); i++ {
|
||||||
|
line := lines[i]
|
||||||
if line.Text == "" {
|
if line.Text == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 表格识别逻辑
|
||||||
isTableLine, cells := p.identifyTableLine(line)
|
isTableLine, cells := p.identifyTableLine(line)
|
||||||
if isTableLine {
|
if isTableLine {
|
||||||
tableStr := "| " + strings.Join(cells, " | ") + " |"
|
tableStr := "| " + strings.Join(cells, " | ") + " |"
|
||||||
allBlocks = append(allBlocks, Block{Type: "table", Text: tableStr})
|
blocks = append(blocks, Block{Type: "table", Text: tableStr})
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 标题识别逻辑 (比正文大)
|
||||||
if line.MaxFontSize > float64(bodySize)+1 {
|
if line.MaxFontSize > float64(bodySize)+1 {
|
||||||
level := 1
|
level := 1
|
||||||
if line.MaxFontSize < float64(bodySize)+4 {
|
if line.MaxFontSize < float64(bodySize)+4 {
|
||||||
@ -143,14 +159,15 @@ func OpenPDF(filename string) (*PDF, error) {
|
|||||||
level = 2
|
level = 2
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 合并紧随其后的同字体行(处理跨行标题)
|
||||||
fullText := line.Text
|
fullText := line.Text
|
||||||
lastY := line.Y
|
lastY := line.Y
|
||||||
for k := j + 1; k < len(lines); k++ {
|
for j := i + 1; j < len(lines); j++ {
|
||||||
if lines[k].Text != "" && MathAbs(lines[k].MaxFontSize-line.MaxFontSize) < 1.0 {
|
if lines[j].Text != "" && MathAbs(lines[j].MaxFontSize-line.MaxFontSize) < 1.0 {
|
||||||
if MathAbs(lines[k].Y-lastY) < 25 {
|
if MathAbs(lines[j].Y-lastY) < 25 {
|
||||||
fullText += " " + lines[k].Text
|
fullText += " " + lines[j].Text
|
||||||
lastY = lines[k].Y
|
lastY = lines[j].Y
|
||||||
j = k
|
i = j
|
||||||
} else {
|
} else {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
@ -159,100 +176,100 @@ func OpenPDF(filename string) (*PDF, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if j < 5 && i == 1 {
|
// 检查是否重复
|
||||||
|
if i < 5 {
|
||||||
if title, ok := p.Metadata["title"].(string); ok && strings.Contains(title, fullText) {
|
if title, ok := p.Metadata["title"].(string); ok && strings.Contains(title, fullText) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(allBlocks) > 0 && allBlocks[len(allBlocks)-1].Type == "heading" && allBlocks[len(allBlocks)-1].Level == level {
|
// 合并同级标题
|
||||||
allBlocks[len(allBlocks)-1].Text += " " + fullText
|
if len(blocks) > 0 && blocks[len(blocks)-1].Type == "heading" && blocks[len(blocks)-1].Level == level {
|
||||||
|
blocks[len(blocks)-1].Text += " " + fullText
|
||||||
} else {
|
} else {
|
||||||
allBlocks = append(allBlocks, Block{Type: "heading", Level: level, Text: fullText})
|
blocks = append(blocks, Block{Type: "heading", Level: level, Text: fullText})
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
// 跳过页码
|
||||||
if (line.Y < 50 || line.Y > 800) && isPageNumber(line.Text) {
|
if (line.Y < 50 || line.Y > 800) && isPageNumber(line.Text) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 段落识别
|
||||||
fullText := line.Text
|
fullText := line.Text
|
||||||
lastY := line.Y
|
lastY := line.Y
|
||||||
for k := j + 1; k < len(lines); k++ {
|
for j := i + 1; j < len(lines); j++ {
|
||||||
if lines[k].Text != "" && lines[k].MaxFontSize <= float64(bodySize)+1 {
|
if lines[j].Text != "" && lines[j].MaxFontSize <= float64(bodySize)+1 {
|
||||||
isT, _ := p.identifyTableLine(lines[k])
|
isT, _ := p.identifyTableLine(lines[j])
|
||||||
if isT {
|
if isT {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
if (lines[k].Y < 50 || lines[k].Y > 800) && isPageNumber(lines[k].Text) {
|
if (lines[j].Y < 50 || lines[j].Y > 800) && isPageNumber(lines[j].Text) {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
if MathAbs(lines[k].Y-lastY) > 25 {
|
if MathAbs(lines[j].Y-lastY) > 25 {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
fullText += lines[k].Text
|
fullText += lines[j].Text
|
||||||
lastY = lines[k].Y
|
lastY = lines[j].Y
|
||||||
j = k
|
i = j
|
||||||
|
|
||||||
if isPunctuation(lines[k].Text) {
|
if isPunctuation(lines[j].Text) {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
allBlocks = append(allBlocks, Block{Type: "paragraph", Text: fullText})
|
blocks = append(blocks, Block{Type: "paragraph", Text: fullText})
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 4. 智能封面/标题逻辑 (全局)
|
// 4. 智能封面/标题逻辑
|
||||||
|
// 策略:
|
||||||
|
// 1. 第一个 Heading 始终是文档标题 (#)
|
||||||
|
// 2. 在遇到第一个明确的“章节标题”或“正文段落”之前,中间的 Heading 如果很长,则转为正文文本。
|
||||||
|
// 3. 统计 H1,如果只有一个 H1,则尝试将 H2 提升为 H1。
|
||||||
|
|
||||||
h1Count := 0
|
h1Count := 0
|
||||||
for _, b := range allBlocks {
|
for _, b := range blocks {
|
||||||
if b.Type == "heading" && b.Level == 1 {
|
if b.Type == "heading" && b.Level == 1 {
|
||||||
h1Count++
|
h1Count++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
shouldPromote := h1Count <= 1
|
shouldPromote := h1Count <= 1
|
||||||
|
|
||||||
|
firstHeadingFound := false
|
||||||
contentStarted := false
|
contentStarted := false
|
||||||
hasMetadataTitle := false
|
|
||||||
if t, ok := p.Metadata["title"].(string); ok && t != "" {
|
|
||||||
hasMetadataTitle = true
|
|
||||||
}
|
|
||||||
|
|
||||||
firstHeadingProcessed := false
|
var res strings.Builder
|
||||||
|
for _, b := range blocks {
|
||||||
var sb strings.Builder
|
|
||||||
for _, b := range allBlocks {
|
|
||||||
if b.Type == "heading" {
|
if b.Type == "heading" {
|
||||||
level := b.Level
|
level := b.Level
|
||||||
|
|
||||||
if !contentStarted {
|
if !contentStarted {
|
||||||
if !firstHeadingProcessed {
|
if !firstHeadingFound {
|
||||||
firstHeadingProcessed = true
|
// 文档总标题
|
||||||
// 如果有 Metadata Title,则 Metadata Title 充当了真正的第一级标题
|
res.WriteString("\n# " + b.Text + "\n\n")
|
||||||
// 我们把遇到的第一个大标题降级为正文文本(除非它已经是标准章节)
|
firstHeadingFound = true
|
||||||
if hasMetadataTitle && !isStandardSection(b.Text) {
|
|
||||||
sb.WriteString("\n" + b.Text + "\n\n")
|
|
||||||
continue
|
continue
|
||||||
} else {
|
} else {
|
||||||
// 否则作为文档的主标题
|
// 封面期间的其他标题
|
||||||
sb.WriteString("\n# " + b.Text + "\n\n")
|
// 如果是已知的章节名,则认为内容开始了
|
||||||
continue
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if isStandardSection(b.Text) {
|
if isStandardSection(b.Text) {
|
||||||
contentStarted = true
|
contentStarted = true
|
||||||
sb.WriteString("\n# " + b.Text + "\n\n")
|
// 章节名也应该是 #
|
||||||
|
res.WriteString("\n# " + b.Text + "\n\n")
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// 否则作为封面副标题/文本
|
// 否则作为封面副标题/文本
|
||||||
sb.WriteString(b.Text + "\n\n")
|
res.WriteString(b.Text + "\n\n")
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 内容已经开始
|
||||||
if shouldPromote {
|
if shouldPromote {
|
||||||
if level == 2 {
|
if level == 2 {
|
||||||
level = 1
|
level = 1
|
||||||
@ -260,18 +277,17 @@ func OpenPDF(filename string) (*PDF, error) {
|
|||||||
level = 2
|
level = 2
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
sb.WriteString("\n" + strings.Repeat("#", level) + " " + b.Text + "\n\n")
|
res.WriteString("\n" + strings.Repeat("#", level) + " " + b.Text + "\n\n")
|
||||||
} else if b.Type == "paragraph" {
|
} else if b.Type == "paragraph" {
|
||||||
contentStarted = true
|
contentStarted = true // 遇到第一个正文段落,标志着封面/标题期结束
|
||||||
sb.WriteString(b.Text + "\n\n")
|
res.WriteString(b.Text + "\n\n")
|
||||||
} else if b.Type == "table" {
|
} else if b.Type == "table" {
|
||||||
contentStarted = true
|
contentStarted = true
|
||||||
sb.WriteString(b.Text + "\n")
|
res.WriteString(b.Text + "\n")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
p.Content = strings.TrimSpace(sb.String())
|
return res.String()
|
||||||
return p, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func isStandardSection(s string) bool {
|
func isStandardSection(s string) bool {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user