2025-04-03 02:23:49 +00:00
|
|
|
|
package services
|
|
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
|
"github.com/otiai10/gosseract/v2"
|
|
|
|
|
|
"regexp"
|
|
|
|
|
|
"strconv"
|
|
|
|
|
|
"strings"
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
// OCRService 提供OCR文字识别服务
|
|
|
|
|
|
type OCRService struct {
|
|
|
|
|
|
client *gosseract.Client
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// NewOCRService 创建新的OCR服务实例
|
|
|
|
|
|
func NewOCRService() (*OCRService, error) {
|
|
|
|
|
|
client := gosseract.NewClient()
|
|
|
|
|
|
return &OCRService{client: client}, nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Close 关闭OCR服务
|
|
|
|
|
|
func (s *OCRService) Close() {
|
|
|
|
|
|
s.client.Close()
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ExtractInfo 从图片中提取喜报信息
|
2025-04-07 03:18:38 +00:00
|
|
|
|
func (s *OCRService) ExtractInfo(imagePath string) (string, []int, string, error) {
|
|
|
|
|
|
// 设置中文语言包和OCR配置
|
2025-04-03 02:23:49 +00:00
|
|
|
|
err := s.client.SetLanguage("chi_sim")
|
|
|
|
|
|
if err != nil {
|
2025-04-07 03:18:38 +00:00
|
|
|
|
return "", nil, "", err
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 设置Page Segmentation Mode为自动
|
|
|
|
|
|
err = s.client.SetPageSegMode(gosseract.PSM_AUTO)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return "", nil, "", err
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 设置OCR引擎参数
|
|
|
|
|
|
configs := []struct {
|
|
|
|
|
|
key string
|
|
|
|
|
|
value string
|
|
|
|
|
|
}{
|
|
|
|
|
|
{"tessedit_ocr_engine_mode", "2"}, // LSTM only
|
|
|
|
|
|
{"tessedit_enable_dict_correction", "1"}, // 启用字典校正
|
|
|
|
|
|
{"tessedit_pageseg_mode", "3"}, // 完全自动页面分割,但没有OSD
|
|
|
|
|
|
{"tessedit_do_invert", "0"}, // 不反转图像
|
|
|
|
|
|
{"textord_heavy_nr", "1"}, // 处理粗体文本
|
|
|
|
|
|
{"language_model_penalty_non_dict_word", "0.2"}, // 降低非字典词的惩罚
|
|
|
|
|
|
{"language_model_penalty_non_freq_dict_word", "0.2"}, // 降低非常用词的惩罚
|
|
|
|
|
|
{"tessedit_write_images", "1"}, // 输出调试图像
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
for _, cfg := range configs {
|
|
|
|
|
|
err = s.client.SetVariable(gosseract.SettableVariable(cfg.key), cfg.value)
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return "", nil, "", err
|
|
|
|
|
|
}
|
2025-04-03 02:23:49 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 设置图片
|
|
|
|
|
|
err = s.client.SetImage(imagePath)
|
|
|
|
|
|
if err != nil {
|
2025-04-07 03:18:38 +00:00
|
|
|
|
return "", nil, "", err
|
2025-04-03 02:23:49 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 获取文本
|
|
|
|
|
|
text, err := s.client.Text()
|
|
|
|
|
|
if err != nil {
|
2025-04-07 03:18:38 +00:00
|
|
|
|
return "", nil, "", err
|
2025-04-03 02:23:49 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 提取项目名称
|
|
|
|
|
|
projectName := extractProjectName(text)
|
|
|
|
|
|
|
|
|
|
|
|
// 提取点数
|
|
|
|
|
|
points := extractPoints(text)
|
|
|
|
|
|
|
|
|
|
|
|
// 提取代表处
|
|
|
|
|
|
representative := extractRepresentative(text)
|
|
|
|
|
|
|
|
|
|
|
|
return projectName, points, representative, nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// projectNamePatterns lists, in priority order, the expressions tried against
// each line of OCR text; capture group 1 holds the candidate project name.
// They are compiled once at package level rather than on every call.
var projectNamePatterns = []*regexp.Regexp{
	// "<name>项目"
	regexp.MustCompile(`([\p{Han}]+)项目`),
	// "项目: <name>", accepting the ASCII colon and the full-width colon
	// (U+FF1A) that Chinese text normally uses.
	regexp.MustCompile(`项目[:\x{FF1A}]*\s*([\p{Han}]+)`),
	// "<name>工程" or "<name>系统" (the suffix is part of the name).
	regexp.MustCompile(`([\p{Han}]+(?:工程|系统))`),
}

// extractProjectName returns the first project name found in text, or "" when
// no line yields one.
//
// Lines are scanned top to bottom and, for each non-empty line, the patterns
// are tried in priority order. A candidate is accepted only if it is at least
// two characters long.
func extractProjectName(text string) string {
	for _, line := range strings.Split(text, "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}

		for _, re := range projectNamePatterns {
			m := re.FindStringSubmatch(line)
			if len(m) < 2 {
				continue
			}
			name := strings.TrimSpace(m[1])
			// Count runes, not bytes: one Han character is three bytes in
			// UTF-8, so a byte-length check would accept a single character.
			if len([]rune(name)) >= 2 {
				return name
			}
		}
	}
	return ""
}
|
|
|
|
|
|
|
|
|
|
|
|
// pointsNumberPattern matches one run of ASCII digits. It is compiled once at
// package level instead of once per scanned line.
var pointsNumberPattern = regexp.MustCompile(`\d+`)

// extractPoints collects candidate point values from OCR text.
//
// For every line containing the character "点" it scans that line and up to
// three lines above it for integers. Values in the range 1..1000 are kept,
// de-duplicated, and returned in the order they were first seen. The result is
// nil when nothing qualifies.
func extractPoints(text string) []int {
	lines := strings.Split(text, "\n")

	var points []int
	seen := make(map[int]struct{})

	for i, line := range lines {
		// Blank lines cannot contain "点", so no separate empty-line check is
		// needed; surrounding whitespace does not affect the digit matches.
		if !strings.Contains(line, "点") {
			continue
		}

		// Window: the current line plus at most three preceding lines.
		start := i - 3
		if start < 0 {
			start = 0
		}

		for _, candidate := range lines[start : i+1] {
			for _, digits := range pointsNumberPattern.FindAllString(candidate, -1) {
				n, err := strconv.Atoi(digits)
				// Atoi fails only on overflow here; such values are out of
				// range anyway.
				if err != nil || n <= 0 || n > 1000 {
					continue
				}
				if _, dup := seen[n]; dup {
					continue
				}
				seen[n] = struct{}{}
				points = append(points, n)
			}
		}
	}
	return points
}
|
|
|
|
|
|
|
|
|
|
|
|
// maxInt returns the larger of the two integers a and b.
func maxInt(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
|
|
|
|
|
|
|
|
|
|
|
|
// minInt returns the smaller of the two integers a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
|
|
|
|
|
|
2025-04-03 02:23:49 +00:00
|
|
|
|
// representativePatterns are the preferred organization patterns, in priority
// order: representative office (代表处), business unit (事业部), project team
// (项目组). A slice is used instead of a map so that a line containing several
// keywords always resolves the same way.
var representativePatterns = []*regexp.Regexp{
	regexp.MustCompile(`([\p{Han}]{2,}代表处)`),
	regexp.MustCompile(`([\p{Han}]{2,}事业部)`),
	regexp.MustCompile(`([\p{Han}]{2,}项目组)`),
}

// organizationPatterns are looser fallbacks for generic organization names,
// tried only when no representativePatterns entry matches any line.
var organizationPatterns = []*regexp.Regexp{
	regexp.MustCompile(`([\p{Han}]{2,}(?:组|部|处|司|中心))`),
	regexp.MustCompile(`([\p{Han}]{2,}(?:公司|单位))`),
}

// firstOrganizationMatch scans lines top to bottom and, for each line, tries
// the patterns in order; it returns capture group 1 of the first match.
func firstOrganizationMatch(lines []string, patterns []*regexp.Regexp) (string, bool) {
	for _, line := range lines {
		// The patterns only match Han characters, so blank lines and
		// surrounding whitespace need no special handling.
		for _, re := range patterns {
			if m := re.FindStringSubmatch(line); len(m) > 1 {
				return m[1], true
			}
		}
	}
	return "", false
}

// extractRepresentative returns the representative office (or similar
// organization name) found in text, or "" when nothing matches.
//
// It first looks for a name ending in 代表处, 事业部 or 项目组. Only if no
// line contains one does it fall back to generic organization suffixes such
// as 部, 处, 中心, 公司 or 单位.
func extractRepresentative(text string) string {
	lines := strings.Split(text, "\n")

	if name, ok := firstOrganizationMatch(lines, representativePatterns); ok {
		return name
	}
	name, _ := firstOrganizationMatch(lines, organizationPatterns)
	return name
}
|