Add Docx, Pptx, and PDF parsing support (v1.1.0)

This commit is contained in:
Star 2026-05-12 12:30:03 +08:00
parent 810cdd1fda
commit 44c2eb1439
7 changed files with 243 additions and 43 deletions

View File

@ -1,5 +1,12 @@
# CHANGELOG
## v1.1.0 (2026-05-12)
- 新增 Word (`.docx`) 解析支持,可提取全文纯文本。
- 新增 PowerPoint (`.pptx`) 解析支持,可提取幻灯片纯文本。
- 新增 PDF 解析支持,可提取全文纯文本及元数据(页数、作者等)。
- 保持全纯 Go 实现,无 CGo 依赖。
## v1.0.0 (2026-05-12)
- 从 `gojs/office` 迁移并重构为纯 Go 实现。

View File

@ -1,13 +1,13 @@
# office
极简、高效的 Go Excel 处理库,基于 `excelize` 构建,符合 `@go` 设计哲学
极简、高效的 Go Office 文档处理库,符合 `@go` 设计哲学。支持 Excel、Word (Docx)、PowerPoint (Pptx) 和 PDF 的解析与处理
## 特性
- **统一 API**: 提供极简的 `Open`, `Save`, `Get`, `Set` 等操作。
- **自动对齐**: 自动处理工作表的创建和索引
- **对象映射**: 支持将 `[]map[string]any` 直接写入 Excel 或从 Excel 读取
- **高性能**: 尽量减少内存分配和冗余操作
- **统一 API**: 提供极简的 `Open`, `Save`, `Text` 等操作。
- **纯 Go 实现**: 无 CGo 依赖,跨平台支持
- **解析与识别**: 支持从 Docx、Pptx 和 PDF 中提取纯文本内容
- **Excel 增强**: 自动处理工作表对齐,支持对象列表 (`[]map`) 的直接读写
## 快速开始
@ -17,56 +17,59 @@
go get apigo.cc/go/office
```
### 基础用法
### Excel 处理
```go
import "apigo.cc/go/office"
// 创建并写入
// 写入数据
xls := office.New()
table := [][]any{
{"Name", "Age"},
{"Alice", 25},
{"Bob", 30},
}
xls.Set("Sheet1", table, "A1", "")
xls.Set("Sheet1", [][]any{{"Name", "Age"}, {"Alice", 25}}, "A1", "")
xls.Save("example.xlsx")
// 读取
// 读取对象列表
xls2, _ := office.Open("example.xlsx")
data, _ := xls2.Get("Sheet1", "A1", "")
data, _ := xls2.GetData("Sheet1", "A1", "")
```
### 对象列表操作
### Word (Docx) 解析
```go
data := []map[string]any{
{"Name": "Alice", "Age": 25},
{"Name": "Bob", "Age": 30},
}
xls.SetData("Users", data, "A1", "")
doc, _ := office.OpenDocx("contract.docx")
text, _ := doc.Text() // 提取全文文本
fmt.Println(text)
```
### PowerPoint (Pptx) 解析
```go
ppt, _ := office.OpenPptx("presentation.pptx")
text, _ := ppt.Text() // 提取幻灯片全文
```
### PDF 解析
```go
pdf, _ := office.OpenPDF("report.pdf")
text, _ := pdf.Text() // 提取 PDF 纯文本
info := pdf.Info() // 获取页数、作者等元数据
```
## API 参考
### 核心函数
### Excel
- `New() *Excel`
- `Open(filename string, password ...string) (*Excel, error)`
- `Set(sheetName string, table [][]any, start, end string) error`
- `SetData(sheetName string, data []map[string]any, start, end string) error`
- `New() *Excel`: 创建新的 Excel 对象。
- `Open(filename string, password ...string) (*Excel, error)`: 打开现有文件。
### Word (Docx)
- `OpenDocx(filename string) (*Docx, error)`
- `Text() (string, error)`
### Excel 方法
### PowerPoint (Pptx)
- `OpenPptx(filename string) (*Pptx, error)`
- `Text() (string, error)`
- `Save(filename ...string) error`: 保存文件。
- `Bytes() ([]byte, error)`: 获取字节切片。
- `Set(sheetName string, table [][]any, start, end string) error`: 写入二维数据。
- `Get(sheetName string, start, end string) ([][]any, error)`: 读取二维数据。
- `SetData(sheetName string, data []map[string]any, start, end string) error`: 写入对象列表。
- `GetData(sheetName string, start, end string) ([]map[string]any, error)`: 读取对象列表。
- `Sheets() []string`: 获取工作表列表。
- `RemoveSheet(sheetName string) error`: 删除工作表。
### 工具函数
- `MakeCellID(col, row int) string`: 生成单元格 ID (如 "A1")。
- `ParseCellID(cell string) (col, row int)`: 解析单元格 ID。
- `MakeColID(col int) string`: 生成列 ID (如 "A")。
### PDF
- `OpenPDF(filename string) (*PDF, error)`
- `Text() (string, error)`
- `Info() map[string]any`

47
docx.go Normal file
View File

@ -0,0 +1,47 @@
package office
import (
"bytes"
"io"
"apigo.cc/go/file"
"github.com/young2j/oxmltotext/docxtotext"
)
// Docx wraps read access to a Word (.docx) document.
type Docx struct {
// filename is the path of the document; it is set by OpenDocx and opened
// again by Text on every call.
filename string
}
// OpenDocx returns a Docx handle for the Word document (.docx) at filename.
//
// The path is only checked with file.Exists here; the document itself is not
// opened or parsed until Text is called. When the check fails, the result is
// a nil handle together with file.ErrNotExist.
func OpenDocx(filename string) (*Docx, error) {
	if file.Exists(filename) {
		doc := &Docx{filename: filename}
		return doc, nil
	}
	return nil, file.ErrNotExist
}
// Text opens the document's file and returns all text extracted from it.
//
// The file is opened on every call and closed again before Text returns.
// Errors from opening the file or from ReadText are returned unchanged.
func (d *Docx) Text() (string, error) {
	src, openErr := file.Open(d.filename)
	if openErr != nil {
		return "", openErr
	}
	defer src.Close()

	text, err := d.ReadText(src)
	return text, err
}
// ReadText reads a Word document from r and returns its extracted text.
//
// The whole stream is buffered in memory first, and the extractor is then
// given a reader over that buffer. On any error the returned string is empty.
func (d *Docx) ReadText(r io.Reader) (string, error) {
	raw, err := io.ReadAll(r)
	if err != nil {
		return "", err
	}

	text, err := docxtotext.Extract(bytes.NewReader(raw), nil)
	if err == nil {
		return text, nil
	}
	return "", err
}

11
go.mod
View File

@ -5,19 +5,28 @@ go 1.25.0
require (
apigo.cc/go/cast v1.3.0
apigo.cc/go/file v1.3.0
github.com/dslipak/pdf v0.0.2
github.com/xuri/excelize/v2 v2.10.1
github.com/young2j/oxmltotext v1.0.3
)
require (
apigo.cc/go/encoding v1.3.0 // indirect
apigo.cc/go/rand v1.3.0 // indirect
apigo.cc/go/safe v1.3.0 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/andybalholm/brotli v1.0.5 // indirect
github.com/dgrr/quickxml v0.0.0-20201022091424-4977de546d6c // indirect
github.com/klauspost/compress v1.17.0 // indirect
github.com/otiai10/gosseract/v2 v2.4.1 // indirect
github.com/richardlehane/mscfb v1.0.6 // indirect
github.com/richardlehane/msoleps v1.0.6 // indirect
github.com/tiendc/go-deepcopy v1.7.2 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect
github.com/valyala/fasthttp v1.51.0 // indirect
github.com/xuri/efp v0.0.1 // indirect
github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 // indirect
go.uber.org/multierr v1.10.0 // indirect
go.uber.org/zap v1.26.0 // indirect
golang.org/x/crypto v0.51.0 // indirect
golang.org/x/net v0.53.0 // indirect
golang.org/x/sys v0.44.0 // indirect

25
go.sum
View File

@ -8,13 +8,24 @@ apigo.cc/go/rand v1.3.0 h1:k+UFAhMySwXf+dq8Om9TniZV6fm6gAE0evbrqMEdwQU=
apigo.cc/go/rand v1.3.0/go.mod h1:mZ/4Soa3bk+XvDaqPWJuUe1bfEi4eThBj1XmEAuYxsk=
apigo.cc/go/safe v1.3.0 h1:uctdAUsphT9p60Tk4oS5xPCe0NoIdOHfsYv4PNS0Rok=
apigo.cc/go/safe v1.3.0/go.mod h1:tC9X14V+qh0BqIrVg4UkXbl+2pEN+lj2ZNI8IjDB6Fs=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs=
github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgrr/quickxml v0.0.0-20201022091424-4977de546d6c h1:oKR1rVrbYTNvwCdiz/4qP+z0lDHe5XMb0Bn7ijdPPak=
github.com/dgrr/quickxml v0.0.0-20201022091424-4977de546d6c/go.mod h1:beyEemCEXTgeWAoLJWZxlgT2vtYIEwGWcjWkGA8OBSQ=
github.com/dslipak/pdf v0.0.2 h1:djAvcM5neg9Ush+zR6QXB+VMJzR6TdnX766HPIg1JmI=
github.com/dslipak/pdf v0.0.2/go.mod h1:2L3SnkI9cQwnAS9gfPz2iUoLC0rUZwbucpbKi5R1mUo=
github.com/klauspost/compress v1.17.0 h1:Rnbp4K9EjcDuVuHtd0dgA4qNuv9yKDYKK1ulpJwgrqM=
github.com/klauspost/compress v1.17.0/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/otiai10/gosseract/v2 v2.4.1 h1:G8AyBpXEeSlcq8TI85LH/pM5SXk8Djy2GEXisgyblRw=
github.com/otiai10/gosseract/v2 v2.4.1/go.mod h1:1gNWP4Hgr2o7yqWfs6r5bZxAatjOIdqWxJLWsTsembk=
github.com/otiai10/mint v1.6.3 h1:87qsV/aw1F5as1eH1zS/yqHY85ANKVMgkDrf9rcxbQs=
github.com/otiai10/mint v1.6.3/go.mod h1:MJm72SBthJjz8qhefc4z1PYEieWmy8Bku7CjcAqyUSM=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/richardlehane/mscfb v1.0.6 h1:eN3bvvZCp00bs7Zf52bxNwAx5lJDBK1tCuH19qq5aC8=
@ -27,12 +38,24 @@ github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/tiendc/go-deepcopy v1.7.2 h1:Ut2yYR7W9tWjTQitganoIue4UGxZwCcJy3orjrrIj44=
github.com/tiendc/go-deepcopy v1.7.2/go.mod h1:4bKjNC2r7boYOkD2IOuZpYjmlDdzjbpTRyCx+goBCJQ=
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/valyala/fasthttp v1.51.0 h1:8b30A5JlZ6C7AS81RsWjYMQmrZG6feChmgAolCl1SqA=
github.com/valyala/fasthttp v1.51.0/go.mod h1:oI2XroL+lI7vdXyYoQk03bXBThfFl2cVdIA3Xl7cH8g=
github.com/xuri/efp v0.0.1 h1:fws5Rv3myXyYni8uwj2qKjVaRP30PdjeYe2Y6FDsCL8=
github.com/xuri/efp v0.0.1/go.mod h1:ybY/Jr0T0GTCnYjKqmdwxyxn2BQf2RcQIIvex5QldPI=
github.com/xuri/excelize/v2 v2.10.1 h1:V62UlqopMqha3kOpnlHy2CcRVw1V8E63jFoWUmMzxN0=
github.com/xuri/excelize/v2 v2.10.1/go.mod h1:iG5tARpgaEeIhTqt3/fgXCGoBRt4hNXgCp3tfXKoOIc=
github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 h1:+C0TIdyyYmzadGaL/HBLbf3WdLgC29pgyhTjAT/0nuE=
github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9/go.mod h1:WwHg+CVyzlv/TX9xqBFXEZAuxOPxn2k1GNHwG41IIUQ=
github.com/young2j/oxmltotext v1.0.3 h1:NyePZnyQiBVE4lMq90eREbf73O+Ofo9qeOG48TFwvME=
github.com/young2j/oxmltotext v1.0.3/go.mod h1:LXbK3mqlFNiVQvukLY+lcRHeEwOb1jMu4nZwhBrXay8=
go.uber.org/goleak v1.2.0 h1:xqgm/S+aQvhWFTtR0XK3Jvg7z8kGV8P4X14IzwN3Eqk=
go.uber.org/goleak v1.2.0/go.mod h1:XJYK+MuIchqpmGmUSAzotztawfKvYLUIgg7guXrwVUo=
go.uber.org/multierr v1.10.0 h1:S0h4aNzvfcFsC3dRF1jLoaov7oRaKqRGC/pUEJ2yvPQ=
go.uber.org/multierr v1.10.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo=
go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so=
golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI=
golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8=
golang.org/x/image v0.25.0 h1:Y6uW6rH1y5y/LK1J8BPWZtr6yZ7hrsy6hFrXjgsc2fQ=

64
pdf.go Normal file
View File

@ -0,0 +1,64 @@
package office
import (
	"bytes"
	"io"
	"os"
	"strings"

	"apigo.cc/go/file"
	"github.com/dslipak/pdf"
)
// PDF wraps read access to a PDF document.
type PDF struct {
// filename is the path of the document; it is set by OpenPDF and opened
// again by Text and Info on every call.
filename string
}
// OpenPDF returns a PDF handle for the document at filename.
//
// The path is only checked with file.Exists here; the document itself is not
// opened or parsed until Text or Info is called. When the check fails, the
// result is a nil handle together with file.ErrNotExist.
func OpenPDF(filename string) (*PDF, error) {
	if file.Exists(filename) {
		doc := &PDF{filename: filename}
		return doc, nil
	}
	return nil, file.ErrNotExist
}
// Text returns the plain text of the PDF as produced by the reader's
// GetPlainText.
//
// The file is opened on every call and closed again before Text returns.
// Errors from opening, stat-ing, parsing or reading the document are
// returned unchanged, together with an empty string.
func (p *PDF) Text() (string, error) {
	// pdf.Open returns only the reader and gives the caller no way to close
	// the file it opened, so each call would leak a descriptor. Open the file
	// here instead and hand it to pdf.NewReader so it can be released.
	f, err := os.Open(p.filename)
	if err != nil {
		return "", err
	}
	defer f.Close()

	st, err := f.Stat()
	if err != nil {
		return "", err
	}
	r, err := pdf.NewReader(f, st.Size())
	if err != nil {
		return "", err
	}

	t, err := r.GetPlainText()
	if err != nil {
		return "", err
	}
	// Drain the text reader before the deferred Close runs.
	var b bytes.Buffer
	if _, err := io.Copy(&b, t); err != nil {
		return "", err
	}
	return b.String(), nil
}
// Info returns the PDF's metadata as a map.
//
// The map holds the lower-cased name of every non-empty field among Title,
// Author, Subject, Keywords, Creator, Producer, CreationDate and ModDate,
// plus "pages" with the reader's page count. The file is opened on every
// call and closed again before Info returns. If the document cannot be
// opened or parsed, Info returns nil.
func (p *PDF) Info() map[string]any {
	// pdf.Open returns only the reader and gives the caller no way to close
	// the file it opened, so each call would leak a descriptor. Open the file
	// here instead and hand it to pdf.NewReader so it can be released.
	f, err := os.Open(p.filename)
	if err != nil {
		return nil
	}
	defer f.Close()

	st, err := f.Stat()
	if err != nil {
		return nil
	}
	r, err := pdf.NewReader(f, st.Size())
	if err != nil {
		return nil
	}

	// Commonly used PDF metadata fields.
	fields := []string{"Title", "Author", "Subject", "Keywords", "Creator", "Producer", "CreationDate", "ModDate"}
	info := make(map[string]any, len(fields)+1)

	// Fetch the info dictionary once instead of once per field.
	meta := r.GetInfo()
	for _, field := range fields {
		if val := meta.Get(field); val != "" {
			info[strings.ToLower(field)] = val
		}
	}
	info["pages"] = r.NumPage()
	return info
}

47
pptx.go Normal file
View File

@ -0,0 +1,47 @@
package office
import (
"bytes"
"io"
"apigo.cc/go/file"
"github.com/young2j/oxmltotext/pptxtotext"
)
// Pptx wraps read access to a PowerPoint (.pptx) document.
type Pptx struct {
// filename is the path of the document; it is set by OpenPptx and opened
// again by Text on every call.
filename string
}
// OpenPptx returns a Pptx handle for the PowerPoint document (.pptx) at
// filename.
//
// The path is only checked with file.Exists here; the document itself is not
// opened or parsed until Text is called. When the check fails, the result is
// a nil handle together with file.ErrNotExist.
func OpenPptx(filename string) (*Pptx, error) {
	if file.Exists(filename) {
		deck := &Pptx{filename: filename}
		return deck, nil
	}
	return nil, file.ErrNotExist
}
// Text opens the presentation's file and returns all text extracted from it.
//
// The file is opened on every call and closed again before Text returns.
// Errors from opening the file or from ReadText are returned unchanged.
func (p *Pptx) Text() (string, error) {
	src, openErr := file.Open(p.filename)
	if openErr != nil {
		return "", openErr
	}
	defer src.Close()

	text, err := p.ReadText(src)
	return text, err
}
// ReadText reads a PowerPoint document from r and returns its extracted text.
//
// The whole stream is buffered in memory first, and the extractor is then
// given a reader over that buffer. On any error the returned string is empty.
func (p *Pptx) ReadText(r io.Reader) (string, error) {
	raw, err := io.ReadAll(r)
	if err != nil {
		return "", err
	}

	text, err := pptxtotext.Extract(bytes.NewReader(raw), nil)
	if err == nil {
		return text, nil
	}
	return "", err
}