Add Docx, Pptx, and PDF parsing support (v1.1.0)
This commit is contained in:
parent
810cdd1fda
commit
44c2eb1439
@ -1,5 +1,12 @@
|
||||
# CHANGELOG
|
||||
|
||||
## v1.1.0 (2026-05-12)
|
||||
|
||||
- 新增 Word (`.docx`) 解析支持,可提取全文纯文本。
|
||||
- 新增 PowerPoint (`.pptx`) 解析支持,可提取幻灯片纯文本。
|
||||
- 新增 PDF 解析支持,可提取全文纯文本及元数据(页数、作者等)。
|
||||
- 保持全纯 Go 实现,无 CGo 依赖。
|
||||
|
||||
## v1.0.0 (2026-05-12)
|
||||
|
||||
- 从 `gojs/office` 迁移并重构为纯 Go 实现。
|
||||
|
||||
85
README.md
85
README.md
@ -1,13 +1,13 @@
|
||||
# office
|
||||
|
||||
极简、高效的 Go Excel 处理库,基于 `excelize` 构建,符合 `@go` 设计哲学。
|
||||
极简、高效的 Go Office 文档处理库,符合 `@go` 设计哲学。支持 Excel、Word (Docx)、PowerPoint (Pptx) 和 PDF 的解析与处理。
|
||||
|
||||
## 特性
|
||||
|
||||
- **统一 API**: 提供极简的 `Open`, `Save`, `Get`, `Set` 等操作。
|
||||
- **自动对齐**: 自动处理工作表的创建和索引。
|
||||
- **对象映射**: 支持将 `[]map[string]any` 直接写入 Excel 或从 Excel 读取。
|
||||
- **高性能**: 尽量减少内存分配和冗余操作。
|
||||
- **统一 API**: 提供极简的 `Open`, `Save`, `Text` 等操作。
|
||||
- **纯 Go 实现**: 无 CGo 依赖,跨平台支持。
|
||||
- **解析与识别**: 支持从 Docx、Pptx 和 PDF 中提取纯文本内容。
|
||||
- **Excel 增强**: 自动处理工作表对齐,支持对象列表 (`[]map`) 的直接读写。
|
||||
|
||||
## 快速开始
|
||||
|
||||
@ -17,56 +17,59 @@
|
||||
go get apigo.cc/go/office
|
||||
```
|
||||
|
||||
### 基础用法
|
||||
### Excel 处理
|
||||
|
||||
```go
|
||||
import "apigo.cc/go/office"
|
||||
|
||||
// 创建并写入
|
||||
// 写入数据
|
||||
xls := office.New()
|
||||
table := [][]any{
|
||||
{"Name", "Age"},
|
||||
{"Alice", 25},
|
||||
{"Bob", 30},
|
||||
}
|
||||
xls.Set("Sheet1", table, "A1", "")
|
||||
xls.Set("Sheet1", [][]any{{"Name", "Age"}, {"Alice", 25}}, "A1", "")
|
||||
xls.Save("example.xlsx")
|
||||
|
||||
// 读取
|
||||
// 读取对象列表
|
||||
xls2, _ := office.Open("example.xlsx")
|
||||
data, _ := xls2.Get("Sheet1", "A1", "")
|
||||
data, _ := xls2.GetData("Sheet1", "A1", "")
|
||||
```
|
||||
|
||||
### 对象列表操作
|
||||
### Word (Docx) 解析
|
||||
|
||||
```go
|
||||
data := []map[string]any{
|
||||
{"Name": "Alice", "Age": 25},
|
||||
{"Name": "Bob", "Age": 30},
|
||||
}
|
||||
xls.SetData("Users", data, "A1", "")
|
||||
doc, _ := office.OpenDocx("contract.docx")
|
||||
text, _ := doc.Text() // 提取全文文本
|
||||
fmt.Println(text)
|
||||
```
|
||||
|
||||
### PowerPoint (Pptx) 解析
|
||||
|
||||
```go
|
||||
ppt, _ := office.OpenPptx("presentation.pptx")
|
||||
text, _ := ppt.Text() // 提取幻灯片全文
|
||||
```
|
||||
|
||||
### PDF 解析
|
||||
|
||||
```go
|
||||
pdf, _ := office.OpenPDF("report.pdf")
|
||||
text, _ := pdf.Text() // 提取 PDF 纯文本
|
||||
info := pdf.Info() // 获取页数、作者等元数据
|
||||
```
|
||||
|
||||
## API 参考
|
||||
|
||||
### 核心函数
|
||||
### Excel
|
||||
- `New() *Excel`
|
||||
- `Open(filename string, password ...string) (*Excel, error)`
|
||||
- `Set(sheetName string, table [][]any, start, end string) error`
|
||||
- `SetData(sheetName string, data []map[string]any, start, end string) error`
|
||||
|
||||
- `New() *Excel`: 创建新的 Excel 对象。
|
||||
- `Open(filename string, password ...string) (*Excel, error)`: 打开现有文件。
|
||||
### Word (Docx)
|
||||
- `OpenDocx(filename string) (*Docx, error)`
|
||||
- `Text() (string, error)`
|
||||
|
||||
### Excel 方法
|
||||
### PowerPoint (Pptx)
|
||||
- `OpenPptx(filename string) (*Pptx, error)`
|
||||
- `Text() (string, error)`
|
||||
|
||||
- `Save(filename ...string) error`: 保存文件。
|
||||
- `Bytes() ([]byte, error)`: 获取字节切片。
|
||||
- `Set(sheetName string, table [][]any, start, end string) error`: 写入二维数据。
|
||||
- `Get(sheetName string, start, end string) ([][]any, error)`: 读取二维数据。
|
||||
- `SetData(sheetName string, data []map[string]any, start, end string) error`: 写入对象列表。
|
||||
- `GetData(sheetName string, start, end string) ([]map[string]any, error)`: 读取对象列表。
|
||||
- `Sheets() []string`: 获取工作表列表。
|
||||
- `RemoveSheet(sheetName string) error`: 删除工作表。
|
||||
|
||||
### 工具函数
|
||||
|
||||
- `MakeCellID(col, row int) string`: 生成单元格 ID (如 "A1")。
|
||||
- `ParseCellID(cell string) (col, row int)`: 解析单元格 ID。
|
||||
- `MakeColID(col int) string`: 生成列 ID (如 "A")。
|
||||
### PDF
|
||||
- `OpenPDF(filename string) (*PDF, error)`
|
||||
- `Text() (string, error)`
|
||||
- `Info() map[string]any`
|
||||
|
||||
47
docx.go
Normal file
47
docx.go
Normal file
@ -0,0 +1,47 @@
|
||||
package office
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
|
||||
"apigo.cc/go/file"
|
||||
"github.com/young2j/oxmltotext/docxtotext"
|
||||
)
|
||||
|
||||
// Docx is a read-only handle to a Word (.docx) document on disk.
//
// It stores only the path of the document; the file itself is opened each
// time Text is called.
type Docx struct {
	// filename is the path of the .docx file, as passed to OpenDocx.
	filename string
}
|
||||
|
||||
// OpenDocx 打开一个 Word 文档 (.docx)。
|
||||
func OpenDocx(filename string) (*Docx, error) {
|
||||
if !file.Exists(filename) {
|
||||
return nil, file.ErrNotExist
|
||||
}
|
||||
return &Docx{filename: filename}, nil
|
||||
}
|
||||
|
||||
// Text 提取文档中的所有文本。
|
||||
func (d *Docx) Text() (string, error) {
|
||||
f, err := file.Open(d.filename)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
return d.ReadText(f)
|
||||
}
|
||||
|
||||
// ReadText 从 io.Reader 中读取并提取 Word 文本。
|
||||
func (d *Docx) ReadText(r io.Reader) (string, error) {
|
||||
data, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
res, err := docxtotext.Extract(bytes.NewReader(data), nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return res, nil
|
||||
}
|
||||
11
go.mod
11
go.mod
@ -5,19 +5,28 @@ go 1.25.0
|
||||
require (
|
||||
apigo.cc/go/cast v1.3.0
|
||||
apigo.cc/go/file v1.3.0
|
||||
github.com/dslipak/pdf v0.0.2
|
||||
github.com/xuri/excelize/v2 v2.10.1
|
||||
github.com/young2j/oxmltotext v1.0.3
|
||||
)
|
||||
|
||||
require (
|
||||
apigo.cc/go/encoding v1.3.0 // indirect
|
||||
apigo.cc/go/rand v1.3.0 // indirect
|
||||
apigo.cc/go/safe v1.3.0 // indirect
|
||||
github.com/kr/text v0.2.0 // indirect
|
||||
github.com/andybalholm/brotli v1.0.5 // indirect
|
||||
github.com/dgrr/quickxml v0.0.0-20201022091424-4977de546d6c // indirect
|
||||
github.com/klauspost/compress v1.17.0 // indirect
|
||||
github.com/otiai10/gosseract/v2 v2.4.1 // indirect
|
||||
github.com/richardlehane/mscfb v1.0.6 // indirect
|
||||
github.com/richardlehane/msoleps v1.0.6 // indirect
|
||||
github.com/tiendc/go-deepcopy v1.7.2 // indirect
|
||||
github.com/valyala/bytebufferpool v1.0.0 // indirect
|
||||
github.com/valyala/fasthttp v1.51.0 // indirect
|
||||
github.com/xuri/efp v0.0.1 // indirect
|
||||
github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 // indirect
|
||||
go.uber.org/multierr v1.10.0 // indirect
|
||||
go.uber.org/zap v1.26.0 // indirect
|
||||
golang.org/x/crypto v0.51.0 // indirect
|
||||
golang.org/x/net v0.53.0 // indirect
|
||||
golang.org/x/sys v0.44.0 // indirect
|
||||
|
||||
25
go.sum
25
go.sum
@ -8,13 +8,24 @@ apigo.cc/go/rand v1.3.0 h1:k+UFAhMySwXf+dq8Om9TniZV6fm6gAE0evbrqMEdwQU=
|
||||
apigo.cc/go/rand v1.3.0/go.mod h1:mZ/4Soa3bk+XvDaqPWJuUe1bfEi4eThBj1XmEAuYxsk=
|
||||
apigo.cc/go/safe v1.3.0 h1:uctdAUsphT9p60Tk4oS5xPCe0NoIdOHfsYv4PNS0Rok=
|
||||
apigo.cc/go/safe v1.3.0/go.mod h1:tC9X14V+qh0BqIrVg4UkXbl+2pEN+lj2ZNI8IjDB6Fs=
|
||||
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
|
||||
github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs=
|
||||
github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dgrr/quickxml v0.0.0-20201022091424-4977de546d6c h1:oKR1rVrbYTNvwCdiz/4qP+z0lDHe5XMb0Bn7ijdPPak=
|
||||
github.com/dgrr/quickxml v0.0.0-20201022091424-4977de546d6c/go.mod h1:beyEemCEXTgeWAoLJWZxlgT2vtYIEwGWcjWkGA8OBSQ=
|
||||
github.com/dslipak/pdf v0.0.2 h1:djAvcM5neg9Ush+zR6QXB+VMJzR6TdnX766HPIg1JmI=
|
||||
github.com/dslipak/pdf v0.0.2/go.mod h1:2L3SnkI9cQwnAS9gfPz2iUoLC0rUZwbucpbKi5R1mUo=
|
||||
github.com/klauspost/compress v1.17.0 h1:Rnbp4K9EjcDuVuHtd0dgA4qNuv9yKDYKK1ulpJwgrqM=
|
||||
github.com/klauspost/compress v1.17.0/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
|
||||
github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
|
||||
github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/otiai10/gosseract/v2 v2.4.1 h1:G8AyBpXEeSlcq8TI85LH/pM5SXk8Djy2GEXisgyblRw=
|
||||
github.com/otiai10/gosseract/v2 v2.4.1/go.mod h1:1gNWP4Hgr2o7yqWfs6r5bZxAatjOIdqWxJLWsTsembk=
|
||||
github.com/otiai10/mint v1.6.3 h1:87qsV/aw1F5as1eH1zS/yqHY85ANKVMgkDrf9rcxbQs=
|
||||
github.com/otiai10/mint v1.6.3/go.mod h1:MJm72SBthJjz8qhefc4z1PYEieWmy8Bku7CjcAqyUSM=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/richardlehane/mscfb v1.0.6 h1:eN3bvvZCp00bs7Zf52bxNwAx5lJDBK1tCuH19qq5aC8=
|
||||
@ -27,12 +38,24 @@ github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
github.com/tiendc/go-deepcopy v1.7.2 h1:Ut2yYR7W9tWjTQitganoIue4UGxZwCcJy3orjrrIj44=
|
||||
github.com/tiendc/go-deepcopy v1.7.2/go.mod h1:4bKjNC2r7boYOkD2IOuZpYjmlDdzjbpTRyCx+goBCJQ=
|
||||
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
|
||||
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
|
||||
github.com/valyala/fasthttp v1.51.0 h1:8b30A5JlZ6C7AS81RsWjYMQmrZG6feChmgAolCl1SqA=
|
||||
github.com/valyala/fasthttp v1.51.0/go.mod h1:oI2XroL+lI7vdXyYoQk03bXBThfFl2cVdIA3Xl7cH8g=
|
||||
github.com/xuri/efp v0.0.1 h1:fws5Rv3myXyYni8uwj2qKjVaRP30PdjeYe2Y6FDsCL8=
|
||||
github.com/xuri/efp v0.0.1/go.mod h1:ybY/Jr0T0GTCnYjKqmdwxyxn2BQf2RcQIIvex5QldPI=
|
||||
github.com/xuri/excelize/v2 v2.10.1 h1:V62UlqopMqha3kOpnlHy2CcRVw1V8E63jFoWUmMzxN0=
|
||||
github.com/xuri/excelize/v2 v2.10.1/go.mod h1:iG5tARpgaEeIhTqt3/fgXCGoBRt4hNXgCp3tfXKoOIc=
|
||||
github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 h1:+C0TIdyyYmzadGaL/HBLbf3WdLgC29pgyhTjAT/0nuE=
|
||||
github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9/go.mod h1:WwHg+CVyzlv/TX9xqBFXEZAuxOPxn2k1GNHwG41IIUQ=
|
||||
github.com/young2j/oxmltotext v1.0.3 h1:NyePZnyQiBVE4lMq90eREbf73O+Ofo9qeOG48TFwvME=
|
||||
github.com/young2j/oxmltotext v1.0.3/go.mod h1:LXbK3mqlFNiVQvukLY+lcRHeEwOb1jMu4nZwhBrXay8=
|
||||
go.uber.org/goleak v1.2.0 h1:xqgm/S+aQvhWFTtR0XK3Jvg7z8kGV8P4X14IzwN3Eqk=
|
||||
go.uber.org/goleak v1.2.0/go.mod h1:XJYK+MuIchqpmGmUSAzotztawfKvYLUIgg7guXrwVUo=
|
||||
go.uber.org/multierr v1.10.0 h1:S0h4aNzvfcFsC3dRF1jLoaov7oRaKqRGC/pUEJ2yvPQ=
|
||||
go.uber.org/multierr v1.10.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
|
||||
go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo=
|
||||
go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so=
|
||||
golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI=
|
||||
golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8=
|
||||
golang.org/x/image v0.25.0 h1:Y6uW6rH1y5y/LK1J8BPWZtr6yZ7hrsy6hFrXjgsc2fQ=
|
||||
|
||||
64
pdf.go
Normal file
64
pdf.go
Normal file
@ -0,0 +1,64 @@
|
||||
package office
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"apigo.cc/go/file"
|
||||
"github.com/dslipak/pdf"
|
||||
)
|
||||
|
||||
// PDF is a read-only handle to a PDF document on disk.
//
// It stores only the path of the document; the file is opened again by each
// call to Text or Info.
type PDF struct {
	// filename is the path of the PDF file, as passed to OpenPDF.
	filename string
}
|
||||
|
||||
// OpenPDF 打开一个 PDF 文档。
|
||||
func OpenPDF(filename string) (*PDF, error) {
|
||||
if !file.Exists(filename) {
|
||||
return nil, file.ErrNotExist
|
||||
}
|
||||
return &PDF{filename: filename}, nil
|
||||
}
|
||||
|
||||
// Text 提取 PDF 中的所有文本。
|
||||
func (p *PDF) Text() (string, error) {
|
||||
f, err := pdf.Open(p.filename)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
var b bytes.Buffer
|
||||
t, err := f.GetPlainText()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
_, err = io.Copy(&b, t)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return b.String(), nil
|
||||
}
|
||||
|
||||
// Info 获取 PDF 的元数据。
|
||||
func (p *PDF) Info() map[string]any {
|
||||
f, err := pdf.Open(p.filename)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
info := make(map[string]any)
|
||||
// 常见的 PDF 元数据字段
|
||||
fields := []string{"Title", "Author", "Subject", "Keywords", "Creator", "Producer", "CreationDate", "ModDate"}
|
||||
for _, field := range fields {
|
||||
val := f.GetInfo().Get(field)
|
||||
if val != "" {
|
||||
info[strings.ToLower(field)] = val
|
||||
}
|
||||
}
|
||||
info["pages"] = f.NumPage()
|
||||
return info
|
||||
}
|
||||
47
pptx.go
Normal file
47
pptx.go
Normal file
@ -0,0 +1,47 @@
|
||||
package office
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
|
||||
"apigo.cc/go/file"
|
||||
"github.com/young2j/oxmltotext/pptxtotext"
|
||||
)
|
||||
|
||||
// Pptx is a read-only handle to a PowerPoint (.pptx) document on disk.
//
// It stores only the path of the document; the file itself is opened each
// time Text is called.
type Pptx struct {
	// filename is the path of the .pptx file, as passed to OpenPptx.
	filename string
}
|
||||
|
||||
// OpenPptx 打开一个 PowerPoint 文档 (.pptx)。
|
||||
func OpenPptx(filename string) (*Pptx, error) {
|
||||
if !file.Exists(filename) {
|
||||
return nil, file.ErrNotExist
|
||||
}
|
||||
return &Pptx{filename: filename}, nil
|
||||
}
|
||||
|
||||
// Text 提取文档中的所有文本。
|
||||
func (p *Pptx) Text() (string, error) {
|
||||
f, err := file.Open(p.filename)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
return p.ReadText(f)
|
||||
}
|
||||
|
||||
// ReadText 从 io.Reader 中读取并提取 PPT 文本。
|
||||
func (p *Pptx) ReadText(r io.Reader) (string, error) {
|
||||
data, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
res, err := pptxtotext.Extract(bytes.NewReader(data), nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return res, nil
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user