// good-news/services/ocr_service.go

package services

import (
	"github.com/otiai10/gosseract/v2"
	"regexp"
	"strconv"
	"strings"
)

// OCRService provides OCR text-recognition services.
// It holds the gosseract client that the service's methods operate on.
type OCRService struct {
	client *gosseract.Client
}

// NewOCRService builds an OCRService backed by a freshly created gosseract
// client. The error result is always nil in the current implementation.
func NewOCRService() (*OCRService, error) {
	service := &OCRService{
		client: gosseract.NewClient(),
	}
	return service, nil
}

// Close shuts down the OCR service by closing the underlying gosseract client.
// Whatever the client's Close call returns is not propagated to the caller.
func (s *OCRService) Close() {
	s.client.Close()
}

// ExtractInfo runs OCR on the image at imagePath and extracts the good-news
// announcement fields from the recognized text.
//
// On success it returns, in order, the results of extractProjectName,
// extractPoints and extractRepresentative applied to the recognized text,
// with a nil error. If any client call fails, it returns zero values together
// with that error, unmodified.
//
// The client is reconfigured (language, page segmentation mode, engine
// variables) on every call before the image is set.
func (s *OCRService) ExtractInfo(imagePath string) (string, []int, string, error) {
	// Use the Simplified Chinese language pack.
	if err := s.client.SetLanguage("chi_sim"); err != nil {
		return "", nil, "", err
	}

	// Automatic page segmentation.
	if err := s.client.SetPageSegMode(gosseract.PSM_AUTO); err != nil {
		return "", nil, "", err
	}

	// Engine variables, applied in the order listed below.
	type engineVar struct {
		name    string
		setting string
	}
	engineVars := []engineVar{
		// OCR engine mode.
		// NOTE(review): the original comment said "LSTM only", but Tesseract
		// documents mode 2 as legacy + LSTM combined (1 is LSTM only) — confirm intent.
		{"tessedit_ocr_engine_mode", "2"},
		// Enable dictionary correction.
		{"tessedit_enable_dict_correction", "1"},
		// Fully automatic page segmentation, without OSD.
		{"tessedit_pageseg_mode", "3"},
		// Do not invert the image.
		{"tessedit_do_invert", "0"},
		// NOTE(review): the original comment said this handles bold text;
		// Tesseract describes this variable in terms of noise removal — confirm.
		{"textord_heavy_nr", "1"},
		// Penalty applied to non-dictionary words.
		{"language_model_penalty_non_dict_word", "0.2"},
		// Penalty applied to infrequent dictionary words.
		{"language_model_penalty_non_freq_dict_word", "0.2"},
		// Emit debug images.
		{"tessedit_write_images", "1"},
	}
	for i := range engineVars {
		v := engineVars[i]
		if err := s.client.SetVariable(gosseract.SettableVariable(v.name), v.setting); err != nil {
			return "", nil, "", err
		}
	}

	// Point the client at the image to recognize.
	if err := s.client.SetImage(imagePath); err != nil {
		return "", nil, "", err
	}

	// Run recognition and fetch the plain text.
	recognized, err := s.client.Text()
	if err != nil {
		return "", nil, "", err
	}

	// Project name, point values and representative office, in that order.
	return extractProjectName(recognized), extractPoints(recognized), extractRepresentative(recognized), nil
}

// projectNamePatterns lists the expressions tried, in order, against each line
// when searching for a project name. Capture group 1 holds the candidate name.
// They are compiled once at package initialization rather than on every call.
var projectNamePatterns = []*regexp.Regexp{
	// Han text immediately followed by "项目".
	regexp.MustCompile(`([\p{Han}]+)项目`),
	// "项目", optional colons and whitespace, then Han text.
	regexp.MustCompile(`项目[：:]*\s*([\p{Han}]+)`),
	// Han text ending in "工程" or "系统" (suffix included in the name).
	regexp.MustCompile(`([\p{Han}]+(?:工程|系统))`),
}

// extractProjectName returns the first project name found in text, or "" when
// none is found.
//
// The text is scanned line by line; blank lines are skipped. On each line the
// patterns in projectNamePatterns are tried in order, and the first captured
// name that is at least two characters long is returned.
func extractProjectName(text string) string {
	for _, line := range strings.Split(text, "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}

		for _, pattern := range projectNamePatterns {
			matches := pattern.FindStringSubmatch(line)
			if len(matches) < 2 {
				continue
			}

			name := strings.TrimSpace(matches[1])
			// Count runes, not bytes: a single Han character occupies three
			// bytes in UTF-8, so a byte-length check would accept
			// one-character names.
			if len([]rune(name)) >= 2 {
				return name
			}
		}
	}
	return ""
}

// pointsNumberPattern matches a run of ASCII digits. It is compiled once at
// package initialization instead of once per scanned line.
var pointsNumberPattern = regexp.MustCompile(`(\d+)`)

// extractPoints collects candidate point values from text.
//
// For every line that contains the character "点", the numbers on that line
// and on up to three lines before it are collected. Only values in the range
// 1..1000 are kept. The result lists each distinct value once, in order of
// first appearance; it is nil when nothing is found.
func extractPoints(text string) []int {
	const (
		lookBack      = 3    // how many preceding lines are scanned with a "点" line
		maxPointValue = 1000 // larger numbers are not treated as point values
	)

	lines := strings.Split(text, "\n")

	var points []int
	seen := make(map[int]bool)

	for i, line := range lines {
		if !strings.Contains(line, "点") {
			continue
		}

		// Window covering the "点" line itself plus up to lookBack lines
		// before it, clamped at the start of the text.
		start := i - lookBack
		if start < 0 {
			start = 0
		}

		for _, candidate := range lines[start : i+1] {
			for _, digits := range pointsNumberPattern.FindAllString(candidate, -1) {
				num, err := strconv.Atoi(digits)
				if err != nil {
					// Digit runs too large for int are not point values.
					continue
				}
				if num <= 0 || num > maxPointValue || seen[num] {
					continue
				}
				seen[num] = true
				points = append(points, num)
			}
		}
	}

	return points
}

// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b > a {
		return b
	}
	return a
}

// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b < a {
		return b
	}
	return a
}

// representativePatterns pairs each organisation keyword with the expression
// that captures a full name ending in that keyword (two or more Han characters
// followed by the keyword). It is an ordered slice rather than a map so that a
// line containing several keywords always resolves the same way: entries are
// tried top to bottom.
var representativePatterns = []struct {
	keyword string
	re      *regexp.Regexp
}{
	{"代表处", regexp.MustCompile(`([\p{Han}]{2,}代表处)`)},
	{"事业部", regexp.MustCompile(`([\p{Han}]{2,}事业部)`)},
	{"项目组", regexp.MustCompile(`([\p{Han}]{2,}项目组)`)},
}

// orgNamePatterns are the fallback expressions for generic organisation names,
// tried in order when no keyword pattern matched anywhere in the text.
var orgNamePatterns = []*regexp.Regexp{
	regexp.MustCompile(`([\p{Han}]{2,}(?:组|部|处|司|中心))`),
	regexp.MustCompile(`([\p{Han}]{2,}(?:公司|单位))`),
}

// extractRepresentative returns the name of the representative office (or a
// similar organisational unit) mentioned in text, or "" when none is found.
//
// It works in two passes over the non-blank, whitespace-trimmed lines:
//  1. Return the first match of a keyword pattern from representativePatterns
//     ("…代表处", "…事业部", "…项目组").
//  2. If no line matched, return the first match of a generic pattern from
//     orgNamePatterns.
func extractRepresentative(text string) string {
	// Trim once and drop blank lines so both passes see the same input.
	var lines []string
	for _, raw := range strings.Split(text, "\n") {
		if line := strings.TrimSpace(raw); line != "" {
			lines = append(lines, line)
		}
	}

	// Pass 1: explicit keywords.
	for _, line := range lines {
		for _, p := range representativePatterns {
			// Cheap substring check before running the regular expression.
			if !strings.Contains(line, p.keyword) {
				continue
			}
			if m := p.re.FindStringSubmatch(line); len(m) > 1 {
				return m[1]
			}
		}
	}

	// Pass 2: generic organisation suffixes.
	for _, line := range lines {
		for _, re := range orgNamePatterns {
			if m := re.FindStringSubmatch(line); len(m) > 1 {
				return m[1]
			}
		}
	}

	return ""
}