2026-05-12 12:30:03 +08:00
|
|
|
package office
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"io"
|
2026-05-12 12:36:41 +08:00
|
|
|
"os"
|
2026-05-12 12:30:03 +08:00
|
|
|
|
|
|
|
|
"apigo.cc/go/file"
|
|
|
|
|
"github.com/young2j/oxmltotext/docxtotext"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// Docx is a handle used to read a Word (.docx) document.
type Docx struct {
	// filename is the path of the document this handle reads from.
	filename string
}
|
|
|
|
|
|
|
|
|
|
// OpenDocx 打开一个 Word 文档 (.docx)。
|
|
|
|
|
func OpenDocx(filename string) (*Docx, error) {
|
|
|
|
|
if !file.Exists(filename) {
|
2026-05-12 12:36:41 +08:00
|
|
|
return nil, os.ErrNotExist
|
2026-05-12 12:30:03 +08:00
|
|
|
}
|
|
|
|
|
return &Docx{filename: filename}, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Text 提取文档中的所有文本。
|
|
|
|
|
func (d *Docx) Text() (string, error) {
|
2026-05-12 12:36:41 +08:00
|
|
|
dp, err := docxtotext.Open(d.filename)
|
2026-05-12 12:30:03 +08:00
|
|
|
if err != nil {
|
|
|
|
|
return "", err
|
|
|
|
|
}
|
2026-05-12 12:36:41 +08:00
|
|
|
defer dp.Close()
|
2026-05-12 12:30:03 +08:00
|
|
|
|
2026-05-12 12:36:41 +08:00
|
|
|
return dp.ExtractTexts()
|
2026-05-12 12:30:03 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ReadText 从 io.Reader 中读取并提取 Word 文本。
|
2026-05-12 12:36:41 +08:00
|
|
|
func (d *Docx) ReadText(r io.ReaderAt, size int64) (string, error) {
|
|
|
|
|
dp, err := docxtotext.OpenReader(r, size)
|
2026-05-12 12:30:03 +08:00
|
|
|
if err != nil {
|
|
|
|
|
return "", err
|
|
|
|
|
}
|
2026-05-12 12:36:41 +08:00
|
|
|
defer dp.Close()
|
|
|
|
|
|
|
|
|
|
return dp.ExtractTexts()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ToMarkdown 将 Word 文档内容转换为 Markdown 格式。
|
|
|
|
|
func (d *Docx) ToMarkdown() (string, error) {
|
|
|
|
|
text, err := d.Text()
|
2026-05-12 12:30:03 +08:00
|
|
|
if err != nil {
|
|
|
|
|
return "", err
|
|
|
|
|
}
|
2026-05-12 12:36:41 +08:00
|
|
|
return text, nil
|
2026-05-12 12:30:03 +08:00
|
|
|
}
|