002 ES NGram 分词 + suggest

2022-03-11 作者: 小林-1025

本文继续解决之前提出的需求：通过 NGram 分词 + term suggest，实现以下“智能纠错 + 查询”功能

需求描述：

在输入“使用手川”（即输入有误）的情况下，检索结果需要同时包括“使用手册”和“使用手川”

在输入“马央九”（即输入有误）的情况下，检索结果需要同时包括“马英九”和“马央九”

解决方案：

首先，对目标字段使用NGram进行分词，然后使用 term suggest 拿到es根据索引给出的suggest结果

其次，使用“检索关键词”+上一步返回的options，通过 terms 查询进行最大范围的检索

遗留问题：

（1）对单个词给的options并不理想

（2）通过 _analyze 可以看到，ngram 分词会产生大量的 term，导致 index 占用空间比较大

遗留问题方案：

（1）此次需求的业务方并没有针对单个词的精准检索需求，因此最终由业务方在关键字上做了处理

（2）业务方数据量并不是很大，而且属于离线任务

总结：

（1）假如单纯实现在检索框中输入文字，然后给出下拉提示的功能，可以对用户的检索词进行记录，然后通过prefix completion 功能完成

（2）为什么不是采用ik_max_word，然后检索的时候经过分词不就可以达到最大范围的覆盖查询了吗？原因有以下两点

a. 先从分词器上看，"使用手册"的分词结果包括“使用”“用手”，在几十万文档的情况下，包括这类关键词的返回hit非常多，然而这不是实际需要的数据，所以我们在查询上需要使用精准的term查询，返回的数据在业务上才可以使用

b. 再者，业务上需要检索的关键字并不一定在ik的分词范围之内，不太可能每次去修改dict然后进行索引重建

# Create the blogs_ngram index.
# The "name" text field is analyzed by "ngram_analyzer", a custom analyzer whose
# "ngram_tokenizer" emits n-grams of 2 to 20 characters built from letter and
# digit characters. index.max_ngram_diff has to be at least
# max_gram - min_gram (20 - 2 = 18); it is set to 20 here.
PUT /blogs_ngram
{
  "settings": {
    "index.max_ngram_diff": 20,
    "analysis": {
      "tokenizer": {
        "ngram_tokenizer": {
          "type": "ngram",
          "min_gram": 2,
          "max_gram": 20,
          "token_chars": ["letter", "digit"]
        }
      },
      "analyzer": {
        "ngram_analyzer": {
          "type": "custom",
          "tokenizer": "ngram_tokenizer"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "name": {
        "type": "text",
        "analyzer": "ngram_analyzer"
      }
    }
  }
}

# Bulk-index ten sample documents into blogs_ngram (auto-generated ids).
# refresh=true makes them searchable as soon as the request returns.
POST _bulk/?refresh=true
{ "index": { "_index": "blogs_ngram" } }
{ "name": "中国工商银行北京分行" }
{ "index": { "_index": "blogs_ngram" } }
{ "name": "中国工商银行重庆分行" }
{ "index": { "_index": "blogs_ngram" } }
{ "name": "中国工商银行天津分行" }
{ "index": { "_index": "blogs_ngram" } }
{ "name": "中国招商银行北京分行" }
{ "index": { "_index": "blogs_ngram" } }
{ "name": "工行新疆分行" }
{ "index": { "_index": "blogs_ngram" } }
{ "name": "你我她公司使用手册范文" }
{ "index": { "_index": "blogs_ngram" } }
{ "name": "测试错别字使用手川" }
{ "index": { "_index": "blogs_ngram" } }
{ "name": "马英九受邀请访问日本" }
{ "index": { "_index": "blogs_ngram" } }
{ "name": "台湾马英九到访美国" }
{ "index": { "_index": "blogs_ngram" } }
{ "name": "马央九先生在吃水果" }

# Exact term lookup: the query text is not analyzed, so "工商" must exist
# verbatim as an indexed n-gram of the "name" field.
POST /blogs_ngram/_search
{
  "query": {
    "term": {
      "name": {
        "value": "工商"
      }
    }
  }
}

# result
{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 3,
      "relation" : "eq"
    },
    "max_score" : 0.8014579,
    "hits" : [
      {
        "_index" : "blogs_ngram",
        "_type" : "_doc",
        "_id" : "vlmwHHkBgt5AqocFYkFa",
        "_score" : 0.8014579,
        "_source" : {
          "name" : "中国工商银行北京分行"
        }
      },
      {
        "_index" : "blogs_ngram",
        "_type" : "_doc",
        "_id" : "v1mwHHkBgt5AqocFYkFa",
        "_score" : 0.8014579,
        "_source" : {
          "name" : "中国工商银行重庆分行"
        }
      },
      {
        "_index" : "blogs_ngram",
        "_type" : "_doc",
        "_id" : "wFmwHHkBgt5AqocFYkFa",
        "_score" : 0.8014579,
        "_source" : {
          "name" : "中国工商银行天津分行"
        }
      }
    ]
  }
}

# Inspect the tokens that ngram_analyzer produces for one sample value.
POST /blogs_ngram/_analyze
{
  "text": "中国工商银行重庆分行",
  "analyzer": "ngram_analyzer"
}

# result
{
  "tokens" : [
    {
      "token" : "中国",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "中国工",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "中国工商",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "中国工商银",
      "start_offset" : 0,
      "end_offset" : 5,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "中国工商银行",
      "start_offset" : 0,
      "end_offset" : 6,
      "type" : "word",
      "position" : 4
    },
    {
      "token" : "中国工商银行重",
      "start_offset" : 0,
      "end_offset" : 7,
      "type" : "word",
      "position" : 5
    },
    {
      "token" : "中国工商银行重庆",
      "start_offset" : 0,
      "end_offset" : 8,
      "type" : "word",
      "position" : 6
    },
    {
      "token" : "中国工商银行重庆分",
      "start_offset" : 0,
      "end_offset" : 9,
      "type" : "word",
      "position" : 7
    },
    {
      "token" : "中国工商银行重庆分行",
      "start_offset" : 0,
      "end_offset" : 10,
      "type" : "word",
      "position" : 8
    },
    {
      "token" : "国工",
      "start_offset" : 1,
      "end_offset" : 3,
      "type" : "word",
      "position" : 9
    },
    {
      "token" : "国工商",
      "start_offset" : 1,
      "end_offset" : 4,
      "type" : "word",
      "position" : 10
    },
    {
      "token" : "国工商银",
      "start_offset" : 1,
      "end_offset" : 5,
      "type" : "word",
      "position" : 11
    },
    {
      "token" : "国工商银行",
      "start_offset" : 1,
      "end_offset" : 6,
      "type" : "word",
      "position" : 12
    },
    {
      "token" : "国工商银行重",
      "start_offset" : 1,
      "end_offset" : 7,
      "type" : "word",
      "position" : 13
    },
    {
      "token" : "国工商银行重庆",
      "start_offset" : 1,
      "end_offset" : 8,
      "type" : "word",
      "position" : 14
    },
    {
      "token" : "国工商银行重庆分",
      "start_offset" : 1,
      "end_offset" : 9,
      "type" : "word",
      "position" : 15
    },
    {
      "token" : "国工商银行重庆分行",
      "start_offset" : 1,
      "end_offset" : 10,
      "type" : "word",
      "position" : 16
    },
    {
      "token" : "工商",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "word",
      "position" : 17
    },
    {
      "token" : "工商银",
      "start_offset" : 2,
      "end_offset" : 5,
      "type" : "word",
      "position" : 18
    },
    {
      "token" : "工商银行",
      "start_offset" : 2,
      "end_offset" : 6,
      "type" : "word",
      "position" : 19
    },
    {
      "token" : "工商银行重",
      "start_offset" : 2,
      "end_offset" : 7,
      "type" : "word",
      "position" : 20
    },
    {
      "token" : "工商银行重庆",
      "start_offset" : 2,
      "end_offset" : 8,
      "type" : "word",
      "position" : 21
    },
    {
      "token" : "工商银行重庆分",
      "start_offset" : 2,
      "end_offset" : 9,
      "type" : "word",
      "position" : 22
    },
    {
      "token" : "工商银行重庆分行",
      "start_offset" : 2,
      "end_offset" : 10,
      "type" : "word",
      "position" : 23
    },
    {
      "token" : "商银",
      "start_offset" : 3,
      "end_offset" : 5,
      "type" : "word",
      "position" : 24
    },
    {
      "token" : "商银行",
      "start_offset" : 3,
      "end_offset" : 6,
      "type" : "word",
      "position" : 25
    },
    {
      "token" : "商银行重",
      "start_offset" : 3,
      "end_offset" : 7,
      "type" : "word",
      "position" : 26
    },
    {
      "token" : "商银行重庆",
      "start_offset" : 3,
      "end_offset" : 8,
      "type" : "word",
      "position" : 27
    },
    {
      "token" : "商银行重庆分",
      "start_offset" : 3,
      "end_offset" : 9,
      "type" : "word",
      "position" : 28
    },
    {
      "token" : "商银行重庆分行",
      "start_offset" : 3,
      "end_offset" : 10,
      "type" : "word",
      "position" : 29
    },
    {
      "token" : "银行",
      "start_offset" : 4,
      "end_offset" : 6,
      "type" : "word",
      "position" : 30
    },
    {
      "token" : "银行重",
      "start_offset" : 4,
      "end_offset" : 7,
      "type" : "word",
      "position" : 31
    },
    {
      "token" : "银行重庆",
      "start_offset" : 4,
      "end_offset" : 8,
      "type" : "word",
      "position" : 32
    },
    {
      "token" : "银行重庆分",
      "start_offset" : 4,
      "end_offset" : 9,
      "type" : "word",
      "position" : 33
    },
    {
      "token" : "银行重庆分行",
      "start_offset" : 4,
      "end_offset" : 10,
      "type" : "word",
      "position" : 34
    },
    {
      "token" : "行重",
      "start_offset" : 5,
      "end_offset" : 7,
      "type" : "word",
      "position" : 35
    },
    {
      "token" : "行重庆",
      "start_offset" : 5,
      "end_offset" : 8,
      "type" : "word",
      "position" : 36
    },
    {
      "token" : "行重庆分",
      "start_offset" : 5,
      "end_offset" : 9,
      "type" : "word",
      "position" : 37
    },
    {
      "token" : "行重庆分行",
      "start_offset" : 5,
      "end_offset" : 10,
      "type" : "word",
      "position" : 38
    },
    {
      "token" : "重庆",
      "start_offset" : 6,
      "end_offset" : 8,
      "type" : "word",
      "position" : 39
    },
    {
      "token" : "重庆分",
      "start_offset" : 6,
      "end_offset" : 9,
      "type" : "word",
      "position" : 40
    },
    {
      "token" : "重庆分行",
      "start_offset" : 6,
      "end_offset" : 10,
      "type" : "word",
      "position" : 41
    },
    {
      "token" : "庆分",
      "start_offset" : 7,
      "end_offset" : 9,
      "type" : "word",
      "position" : 42
    },
    {
      "token" : "庆分行",
      "start_offset" : 7,
      "end_offset" : 10,
      "type" : "word",
      "position" : 43
    },
    {
      "token" : "分行",
      "start_offset" : 8,
      "end_offset" : 10,
      "type" : "word",
      "position" : 44
    }
  ]
}

# Term suggester against the "name" field.
# The suggest text is given once at the top level of "suggest" (global text).
# suggest_mode "always" requests suggestions for every analyzed term of the
# text, including terms that already exist in the index.
POST /blogs_ngram/_search
{
  "suggest": {
    "text": "使用手册",
    "my-suggestion": {
      "term": {
        "suggest_mode": "always",
        "field": "name"
      }
    }
  }
}

# result
{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 0,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "suggest" : {
    "my-suggestion" : [
      {
        "text" : "使用",
        "offset" : 0,
        "length" : 2,
        "options" : [ ]
      },
      {
        "text" : "使用手",
        "offset" : 0,
        "length" : 3,
        "options" : [ ]
      },
      {
        "text" : "使用手册",
        "offset" : 0,
        "length" : 4,
        "options" : [
          {
            "text" : "使用手册范",
            "score" : 0.75,
            "freq" : 1
          },
          {
            "text" : "使用手川",
            "score" : 0.75,
            "freq" : 1
          },
          {
            "text" : "使用手",
            "score" : 0.6666666,
            "freq" : 2
          },
          {
            "text" : "使用手册范文",
            "score" : 0.5,
            "freq" : 1
          }
        ]
      },
      {
        "text" : "用手",
        "offset" : 1,
        "length" : 2,
        "options" : [ ]
      },
      {
        "text" : "用手册",
        "offset" : 1,
        "length" : 3,
        "options" : [ ]
      },
      {
        "text" : "手册",
        "offset" : 2,
        "length" : 2,
        "options" : [ ]
      }
    ]
  }
}

# Widest-coverage lookup: match documents whose "name" field contains any of
# these exact terms (the search keyword plus options taken from the suggester
# response).
POST /blogs_ngram/_search
{
  "query": {
    "terms": {
      "name": ["使用手册", "使用手", "使用手川", "使用手册范"]
    }
  }
}

# result 
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "blogs_ngram",
        "_type" : "_doc",
        "_id" : "w1mwHHkBgt5AqocFYkFa",
        "_score" : 1.0,
        "_source" : {
          "name" : "你我她公司使用手册范文"
        }
      },
      {
        "_index" : "blogs_ngram",
        "_type" : "_doc",
        "_id" : "xFmwHHkBgt5AqocFYkFa",
        "_score" : 1.0,
        "_source" : {
          "name" : "测试错别字使用手川"
        }
      }
    ]
  }
}

002 ES NGram 分词 + suggest

关于我们

热门标签

Elsewhere