Elasticsearch's relevance score is based on the term frequency / inverse document frequency model, TF/IDF for short. Since Elasticsearch 5.0 the default similarity is BM25, a refinement of TF/IDF, and that is the formula you see in the explain output below.
Algorithm walkthrough:
GET /test_index/test_type/_search?explain
{
  "query": {
    "match": {
      "test_field": "test hello"
    }
  }
}
{ "took": 1, "timed_out": false, "_shards": { "total": 5, "successful": 5, "failed": 0 }, "hits": { "total": 3, "max_score": 0.843298, "hits": [ { "_shard": "[test_index][2]", "_node": "1LdqLFqxQQq4xg2MphI_gw", "_index": "test_index", "_type": "test_type", "_id": "6", "_score": 0.843298, "_source": { "test_field": "test test" }, "_explanation": { "value": 0.843298, "description": "sum of:", "details": [ { "value": 0.843298, "description": "sum of:", "details": [ { "value": 0.843298, "description": "weight(test_field:test in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.843298, "description": "score(doc=0,freq=2.0 = termFreq=2.0\n), product of:", "details": [ { "value": 0.6931472, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 2, "description": "docFreq", "details": [] }, { "value": 4, "description": "docCount", "details": [] } ] }, { "value": 1.2166219, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 2, "description": "termFreq=2.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 1.75, "description": "avgFieldLength", "details": [] }, { "value": 2.56, "description": "fieldLength", "details": [] } ] } ] } ] } ] }, { "value": 0, "description": "match on required clause, product of:", "details": [ { "value": 0, "description": "# clause", "details": [] }, { "value": 1, "description": "_type:test_type, product of:", "details": [ { "value": 1, "description": "boost", "details": [] }, { "value": 1, "description": "queryNorm", "details": [] } ] } ] } ] } }, { "_shard": "[test_index][1]", "_node": "1LdqLFqxQQq4xg2MphI_gw", "_index": "test_index", "_type": "test_type", "_id": "8", "_score": 0.43445712, "_source": { "test_field": "test client 2" }, "_explanation": { "value": 0.43445715, "description": "sum of:", "details": [ { "value": 0.43445715, "description": "sum of:", "details": [ { "value": 0.43445715, "description": "weight(test_field:test in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.43445715, "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details": [ { "value": 0.47000363, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 2, "description": "docFreq", "details": [] }, { "value": 3, "description": "docCount", "details": [] } ] }, { "value": 0.92436975, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 1, "description": "termFreq=1.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 3.3333333, "description": "avgFieldLength", "details": [] }, { "value": 4, "description": "fieldLength", "details": [] } ] } ] } ] } ] }, { "value": 0, "description": "match on required clause, product of:", "details": [ { "value": 0, "description": "# clause", "details": [] }, { "value": 1, "description": "_type:test_type, product of:", "details": [ { "value": 1, "description": "boost", "details": [] }, { "value": 1, "description": "queryNorm", "details": [] } ] } ] } ] } }, { "_shard": "[test_index][3]", "_node": "1LdqLFqxQQq4xg2MphI_gw", "_index": "test_index", "_type": "test_type", "_id": "7", "_score": 
0.25316024, "_source": { "test_field": "test client 1" }, "_explanation": { "value": 0.25316024, "description": "sum of:", "details": [ { "value": 0.25316024, "description": "sum of:", "details": [ { "value": 0.25316024, "description": "weight(test_field:test in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.25316024, "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details": [ { "value": 0.2876821, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 1, "description": "docFreq", "details": [] }, { "value": 1, "description": "docCount", "details": [] } ] }, { "value": 0.88, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 1, "description": "termFreq=1.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 3, "description": "avgFieldLength", "details": [] }, { "value": 4, "description": "fieldLength", "details": [] } ] } ] } ] } ] }, { "value": 0, "description": "match on required clause, product of:", "details": [ { "value": 0, "description": "# clause", "details": [] }, { "value": 1, "description": "*:*, product of:", "details": [ { "value": 1, "description": "boost", "details": [] }, { "value": 1, "description": "queryNorm", "details": [] } ] } ] } ] } } ] } }
Searching relies on the inverted index, but sorting relies on the forward index. Once the inverted index has located the matching documents, each field of each document is read in order to sort them; this forward index is exactly what doc values are.
When Elasticsearch builds an index, it builds the inverted index for search, and at the same time it builds the forward index, i.e. doc values, which serve sorting, aggregations, filtering, and so on.
Doc values are stored on disk. If there is enough memory, the operating system automatically caches them in memory and performance stays high; if memory is insufficient, the operating system keeps them on disk.
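Doc values are built by default for every indexed field except analyzed text fields, so they cost disk space. If a field will never be sorted, aggregated, or scripted on, they can be switched off in the mapping. A minimal sketch, reusing test_index/test_type but with a hypothetical keyword field named session_id:

# Disable doc values for a field that is only ever searched, never sorted/aggregated
PUT /test_index/_mapping/test_type
{
  "properties": {
    "session_id": {
      "type": "keyword",
      "doc_values": false
    }
  }
}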
Here is an example illustrating the forward index and the inverted index.
Suppose an index has two docs:
doc1 : hello world you and me
doc2 : hi world, how are you
Build the inverted index:
word     doc1    doc2
hello    *
world    *       *
you      *       *
and      *
me       *
hi               *
how              *
are              *
Suppose an index has two docs:
doc1: {"name": "jack", "age": 27} doc2: {"name": "tom", "age": 30}
Build the forward index (doc values):
document    name    age
doc1        jack    27
doc2        tom     30
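Sorting or aggregating on a field such as age is exactly where this column-oriented structure is read: the values are scanned per document instead of going through the inverted index. A minimal sketch, assuming a hypothetical index my_index that holds the two docs above with a numeric age field:

# Sort and aggregate on age; both operations are served by doc values
GET /my_index/_search
{
  "query": { "match_all": {} },
  "sort": [ { "age": "desc" } ],
  "aggs": {
    "avg_age": { "avg": { "field": "age" } }
  }
}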
How it works:
In other words, Elasticsearch's query phase builds the search results with a priority queue.
For example, suppose there are 60,000 documents in total across three primary shards, 20,000 per shard, with 10 documents per page. To fetch page 1,000 you actually need documents 10,001–10,010, so a priority queue of size 10,010 is built.
Note: do not assume that each shard returns only 10 documents here. That understanding is wrong!
A more detailed breakdown:
The request may first land on a node that holds no shard of this index; that node acts as the coordinating node and forwards the search request to the nodes holding the index's three shards. In the scenario above, to fetch page 1,000 of the 60,000 documents, each shard must produce its own top 10,010 documents out of its 20,000, not just 10. Each of the three shards returns 10,010 entries to the coordinating node, which therefore receives 30,030 entries in total, builds a priority queue of size 30,030 over them, sorts them by _score, and then takes entries 10,001–10,010: the 10 documents of page 1,000 that we wanted.
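In request form, fetching that page is just a from + size search; a sketch against the test_index used earlier. Note that with the default index.max_result_window of 10,000, from + size = 10,010 is rejected out of the box, which is exactly the deep-pagination limit that the scroll section below works around.

# Documents 10,001–10,010 (page 1,000 in the scenario above)
GET /test_index/test_type/_search
{
  "query": { "match_all": {} },
  "from": 10000,
  "size": 10
}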
Imagine two documents with identical values in a timestamp field, and the search results are sorted on that timestamp field. Because search requests are round-robined across all eligible shard copies, the primary shard may return the two documents in one order while a replica returns them in another.
The bouncing results problem: every time the user refreshes the page, the results appear in a different order. The problem can be avoided by always routing the same user to the same shards, which is done by setting the preference parameter to an arbitrary fixed value, such as the user's session ID.
The preference parameter controls which shards or nodes handle a search request. It accepts values such as _primary, _primary_first, _local, _only_node:xyz, _prefer_node:xyz, and _shards:2,3, which are explained in detail on the search preference documentation page.
But the most useful value is an arbitrary string, because it avoids the bouncing results problem.
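A minimal sketch of pinning a user to the same shard copies, assuming a hypothetical session ID user_12345 as the preference value:

# Same preference string => the same shard copies serve every request for this user
GET /test_index/test_type/_search?preference=user_12345
{
  "query": {
    "match": { "test_field": "test" }
  }
}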
In practice, from + size inevitably runs into the deep-pagination bottleneck, and the scroll technique is a good way around it. For example, if we need to pull 100,000 documents in one go, from + size clearly performs very poorly because the priority queue becomes huge. With a scroll query we can fetch the data batch by batch until everything has been read.
How scroll works
A scroll search takes a snapshot of the view at the time of the first search and serves all later requests from that old snapshot; any data changes made in the meantime are not visible to the user. In addition, Elasticsearch sorts internally by _doc, which skips scoring entirely, so performance is high.
Example:
# Use scroll
POST /test_index/_search?scroll=1m
{
  "query": { "match_all": {} },
  "sort": [ "_doc" ],
  "size": 3
}
The response returns a scroll_id:
{ "_scroll_id": "DnF1ZXJ5VGhlbkZldGNoBQAAAAAAAI-sFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACPqxYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3AAAAAAAAj68WMUxkcUxGcXhRUXE0eGcyTXBoSV9ndwAAAAAAAI-tFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACPrhYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3", "took": 3, "timed_out": false, "_shards": { "total": 5, "successful": 5, "failed": 0 }, "hits": { "total": 12, "max_score": null, "hits": [ { "_index": "test_index", "_type": "test_type", "_id": "AWypxxLYFCl_S-ox4wvd", "_score": null, "_source": { "test_content": "my test" }, "sort": [ 0 ] }, { "_index": "test_index", "_type": "test_type", "_id": "6", "_score": null, "_source": { "test_field": "test test" }, "sort": [ 0 ] }, { "_index": "test_index", "_type": "test_type", "_id": "7", "_score": null, "_source": { "test_field": "test client 1" }, "sort": [ 0 ] } ] } }
Scrolling through the results
# Scroll search
POST _search/scroll
{
  "scroll": "1m",
  "scroll_id": "DnF1ZXJ5VGhlbkZldGNoBQAAAAAAAJDMFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACQzRYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3AAAAAAAAkM8WMUxkcUxGcXhRUXE0eGcyTXBoSV9ndwAAAAAAAJDOFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACQ0BYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3"
}
Search results:
{ "_scroll_id": "DnF1ZXJ5VGhlbkZldGNoBQAAAAAAAJDMFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACQzRYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3AAAAAAAAkM8WMUxkcUxGcXhRUXE0eGcyTXBoSV9ndwAAAAAAAJDOFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACQ0BYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3", "took": 1, "timed_out": false, "terminated_early": true, "_shards": { "total": 5, "successful": 5, "failed": 0 }, "hits": { "total": 12, "max_score": null, "hits": [ { "_index": "test_index", "_type": "test_type", "_id": "11", "_score": null, "_source": { "num": 0, "tags": [] }, "sort": [ 0 ] }, { "_index": "test_index", "_type": "test_type", "_id": "8", "_score": null, "_source": { "test_field": "test client 2" }, "sort": [ 1 ] }, { "_index": "test_index", "_type": "test_type", "_id": "4", "_score": null, "_source": { "test_field": "test4" }, "sort": [ 1 ] } ] } }