good-news/services/ocr_service.go

249 lines
6.0 KiB
Go
Raw Normal View History

2025-04-03 02:23:49 +00:00
package services
import (
"github.com/otiai10/gosseract/v2"
"regexp"
"strconv"
"strings"
)
// OCRService provides OCR text-recognition services.
type OCRService struct {
// client is the gosseract client that the service's methods call into.
client *gosseract.Client
}
// NewOCRService builds an OCRService around a freshly created gosseract
// client. The error result is always nil in this implementation.
func NewOCRService() (*OCRService, error) {
	svc := &OCRService{
		client: gosseract.NewClient(),
	}
	return svc, nil
}
// Close shuts down the OCR service by closing the wrapped client.
// NOTE(review): anything the client's Close call returns is not inspected here.
func (s *OCRService) Close() {
s.client.Close()
}
// ExtractInfo 从图片中提取喜报信息
2025-04-07 03:18:38 +00:00
func (s *OCRService) ExtractInfo(imagePath string) (string, []int, string, error) {
// 设置中文语言包和OCR配置
2025-04-03 02:23:49 +00:00
err := s.client.SetLanguage("chi_sim")
if err != nil {
2025-04-07 03:18:38 +00:00
return "", nil, "", err
}
// 设置Page Segmentation Mode为自动
err = s.client.SetPageSegMode(gosseract.PSM_AUTO)
if err != nil {
return "", nil, "", err
}
// 设置OCR引擎参数
configs := []struct {
key string
value string
}{
{"tessedit_ocr_engine_mode", "2"}, // LSTM only
{"tessedit_enable_dict_correction", "1"}, // 启用字典校正
{"tessedit_pageseg_mode", "3"}, // 完全自动页面分割但没有OSD
{"tessedit_do_invert", "0"}, // 不反转图像
{"textord_heavy_nr", "1"}, // 处理粗体文本
{"language_model_penalty_non_dict_word", "0.2"}, // 降低非字典词的惩罚
{"language_model_penalty_non_freq_dict_word", "0.2"}, // 降低非常用词的惩罚
{"tessedit_write_images", "1"}, // 输出调试图像
}
for _, cfg := range configs {
err = s.client.SetVariable(gosseract.SettableVariable(cfg.key), cfg.value)
if err != nil {
return "", nil, "", err
}
2025-04-03 02:23:49 +00:00
}
// 设置图片
err = s.client.SetImage(imagePath)
if err != nil {
2025-04-07 03:18:38 +00:00
return "", nil, "", err
2025-04-03 02:23:49 +00:00
}
// 获取文本
text, err := s.client.Text()
if err != nil {
2025-04-07 03:18:38 +00:00
return "", nil, "", err
2025-04-03 02:23:49 +00:00
}
// 提取项目名称
projectName := extractProjectName(text)
// 提取点数
points := extractPoints(text)
// 提取代表处
representative := extractRepresentative(text)
return projectName, points, representative, nil
}
// projectNamePatterns are tried in order against each line of OCR text.
// Capture group 1 of a match is the candidate project name. They are compiled
// once at package initialization rather than on every call.
var projectNamePatterns = []*regexp.Regexp{
	// "<name>项目": a run of Han characters immediately before "项目".
	regexp.MustCompile(`([\p{Han}]+)项目`),
	// "项目: <name>": a run of Han characters after "项目", optional ASCII or
	// full-width colons, and optional whitespace.
	regexp.MustCompile(`项目[::]*\s*([\p{Han}]+)`),
	// A run of Han characters ending in "工程" or "系统"; the suffix is kept
	// as part of the name.
	regexp.MustCompile(`([\p{Han}]+(?:工程|系统))`),
}

// extractProjectName returns the first project name found in text, or "" if
// there is none.
//
// Lines are scanned top to bottom and, within a line, the patterns in
// projectNamePatterns are tried in order. A candidate is accepted only if it
// is at least two characters long; length is measured in runes, because a
// single Han character occupies three bytes in UTF-8 and a byte count would
// let one-character names through.
func extractProjectName(text string) string {
	for _, line := range strings.Split(text, "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		for _, re := range projectNamePatterns {
			m := re.FindStringSubmatch(line)
			if len(m) < 2 {
				continue
			}
			// The capture group holds Han characters only, so no trimming
			// is needed before measuring it.
			if name := m[1]; len([]rune(name)) >= 2 {
				return name
			}
		}
	}
	return ""
}
// pointsNumberPattern matches a run of ASCII digits. It is compiled once at
// package initialization instead of inside the scanning loop.
var pointsNumberPattern = regexp.MustCompile(`\d+`)

// extractPoints collects candidate point values from OCR text.
//
// For every line that contains the character "点", the line itself and up to
// three lines before it are scanned for integers. Values in the range
// 1..1000 are kept; anything else (including numbers too large to parse) is
// ignored. The result lists each value once, in the order it was first seen,
// and is nil when nothing qualifies.
func extractPoints(text string) []int {
	lines := strings.Split(text, "\n")

	var points []int
	seen := make(map[int]struct{})

	for i, line := range lines {
		if !strings.Contains(line, "点") {
			continue
		}

		// Look back at most three lines, clamped to the start of the text.
		start := i - 3
		if start < 0 {
			start = 0
		}

		for _, candidate := range lines[start : i+1] {
			for _, digits := range pointsNumberPattern.FindAllString(candidate, -1) {
				n, err := strconv.Atoi(digits)
				if err != nil || n <= 0 || n > 1000 {
					continue
				}
				// Windows of nearby "点" lines overlap, so the same number
				// can be encountered more than once.
				if _, dup := seen[n]; dup {
					continue
				}
				seen[n] = struct{}{}
				points = append(points, n)
			}
		}
	}
	return points
}
// 提取代表处
2025-04-07 03:18:38 +00:00
// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
2025-04-03 02:23:49 +00:00
var (
	// representativePatterns match a full organization name ending in one of
	// the primary keywords. They are kept in a slice so that the priority
	// (代表处, then 事业部, then 项目组) is fixed; iterating a map here would
	// make the result depend on Go's randomized map order whenever a line
	// contains more than one keyword.
	representativePatterns = []*regexp.Regexp{
		regexp.MustCompile(`([\p{Han}]{2,}代表处)`),
		regexp.MustCompile(`([\p{Han}]{2,}事业部)`),
		regexp.MustCompile(`([\p{Han}]{2,}项目组)`),
	}

	// organizationFallbackPatterns match looser organization-like names and
	// are consulted only when no primary keyword matched anywhere in the text.
	organizationFallbackPatterns = []*regexp.Regexp{
		regexp.MustCompile(`([\p{Han}]{2,}(?:组|部|处|司|中心))`),
		regexp.MustCompile(`([\p{Han}]{2,}(?:公司|单位))`),
	}
)

// extractRepresentative returns the representative-office (or similar
// organization) name found in text, or "" if there is none.
//
// All lines are first searched, top to bottom, for a name ending in
// "代表处", "事业部" or "项目组". Only if that finds nothing are the lines
// searched again with the looser fallback patterns.
func extractRepresentative(text string) string {
	lines := strings.Split(text, "\n")
	if name := firstRepresentativeMatch(lines, representativePatterns); name != "" {
		return name
	}
	return firstRepresentativeMatch(lines, organizationFallbackPatterns)
}

// firstRepresentativeMatch returns capture group 1 of the first pattern that
// matches the earliest possible line, trying patterns in slice order within
// each line, or "" if nothing matches.
func firstRepresentativeMatch(lines []string, patterns []*regexp.Regexp) string {
	for _, line := range lines {
		for _, re := range patterns {
			if m := re.FindStringSubmatch(line); len(m) > 1 {
				return m[1]
			}
		}
	}
	return ""
}