ElasticSearch 学习笔记 - 11.桶聚合

时间 2019-12-10

原文链接

一、源数据

DELETE my-index

PUT my-index

PUT my-index/person/1
{
  "name":"张三",
  "age":27,
  "gender":"男",
  "salary":15000,
  "dep":"bigdata"
}

PUT my-index/person/2
{
  "name":"李四",
  "age":26,
  "gender":"女",
  "salary":15000,
  "dep":"bigdata"
}

PUT my-index/person/3
{
  "name":"王五",
  "age":26,
  "gender":"男",
  "salary":17000,
  "dep":"AI"
}
PUT my-index/person/4
{
  "name":"刘六",
  "age":27,
  "gender":"女",
  "salary":18000,
  "dep":"AI"
}

PUT my-index/person/5
{
  "name":"程裕强",
  "age":31,
  "gender":"男",
  "salary":20000,
  "dep":"bigdata"
}
PUT my-index/person/6
{
  "name":"hadron",
  "age":30,
  "gender":"男",
  "salary":20000,
  "dep":"AI"
}

二、Terms Aggregation

根据薪资水平进行分组，统计每一个薪资水平的人数

GET /my-index/person/_search
{
  "size": 0,
  "aggs": {
    "group_count": {
      "terms": {
        "field": "salary"
      }
    }
  }
}

{
  "took": 7,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 6,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "group_count": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": 15000,
          "doc_count": 2
        },
        {
          "key": 20000,
          "doc_count": 2
        },
        {
          "key": 17000,
          "doc_count": 1
        },
        {
          "key": 18000,
          "doc_count": 1
        }
      ]
    }
  }
}

统计上面每一个分组的平均年龄

GET /my-index/person/_search
{
  "size": 0,
  "aggs": {
    "group_count": {
      "terms": {
        "field": "salary"
      }
      , "aggs": {
        "avg_age": {
          "avg": {
            "field": "age"
          }
        }
      }
    }
  }
}

{
  "took": 6,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 6,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "group_count": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": 15000,
          "doc_count": 2,
          "avg_age": {
            "value": 26.5
          }
        },
        {
          "key": 20000,
          "doc_count": 2,
          "avg_age": {
            "value": 30.5
          }
        },
        {
          "key": 17000,
          "doc_count": 1,
          "avg_age": {
            "value": 26
          }
        },
        {
          "key": 18000,
          "doc_count": 1,
          "avg_age": {
            "value": 27
          }
        }
      ]
    }
  }
}

统计每一个部门的人数

GET my-index/_search
{
  "size": 0, 
  "aggs": {
    "group_count": {
      "terms": {"field": "dep"}
    }
  }
}

{
  "error": {
    "root_cause": [
      {
        "type": "illegal_argument_exception",
        "reason": "Fielddata is disabled on text fields by default. Set fielddata=true on [dep] in order to load fielddata in memory by uninverting the inverted index. Note that this can however use significant memory. Alternatively use a keyword field instead."
      }
    ],
    "type": "search_phase_execution_exception",
    "reason": "all shards failed",
    "phase": "query",
    "grouped": true,
    "failed_shards": [
      {
        "shard": 0,
        "index": "my-index",
        "node": "fQDwpdT2RfSfPr8ttHQCkA",
        "reason": {
          "type": "illegal_argument_exception",
          "reason": "Fielddata is disabled on text fields by default. Set fielddata=true on [dep] in order to load fielddata in memory by uninverting the inverted index. Note that this can however use significant memory. Alternatively use a keyword field instead."
        }
      }
    ],
    "caused_by": {
      "type": "illegal_argument_exception",
      "reason": "Fielddata is disabled on text fields by default. Set fielddata=true on [dep] in order to load fielddata in memory by uninverting the inverted index. Note that this can however use significant memory. Alternatively use a keyword field instead.",
      "caused_by": {
        "type": "illegal_argument_exception",
        "reason": "Fielddata is disabled on text fields by default. Set fielddata=true on [dep] in order to load fielddata in memory by uninverting the inverted index. Note that this can however use significant memory. Alternatively use a keyword field instead."
      }
    }
  },
  "status": 400
}

根据错误提示“Fielddata is disabled on text fields by default.
Set fielddata=true on [dep] in order to load fielddata in memory by uninverting the inverted index.
Note that this can however use significant memory. Alternatively use a keyword field instead.”可知，需要开启fielddata参数，只需为该字段设置"fielddata": true即可。
此外，根据官方文档提示“Use the my_field.keyword field for aggregations, sorting, or in scripts”，可以尝试使用my_field.keyword格式的字段进行聚合操作。

GET my-index/_search
{
  "size": 0, 
  "aggs": {
    "group_count": {
      "terms": {"field": "dep.keyword"}
    }
  }
}

二、Filter Aggregation

计算男人的平均年龄

也就是统计gender字段包含关键字“男”的文档的age平均值。

GET my-index/_search
{
  "size": 0, 
  "aggs": {
    "group_count": {
      "filter": {
        "term":{"gender": "男"}
      },
      "aggs":{
        "avg_age":{
          "avg":{"field": "age"}
        }
      }
    }
  }
}

三、Filters Aggregation

GET my-index/_search
{
  "size": 0, 
  "aggs": {
    "group_count": {
      "filters":{
        "filters": [
          {"match":{"gender": "男"}},
          {"match":{"gender": "女"}}
        ]
      },
      "aggs":{
        "avg_age":{
            "avg":{"field": "age"}
        }
      }
    }
  }
}

{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 6,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "group_count": {
      "buckets": [
        {
          "doc_count": 4,
          "avg_age": {
            "value": 28.5
          }
        },
        {
          "doc_count": 2,
          "avg_age": {
            "value": 26.5
          }
        }
      ]
    }
  }
}

四、Range Aggregation

from..to区间范围是[from,to),也就是说包含from点，不包含to点
【例子】查询薪资在[0,10000),[10000,20000),[20000,+无穷大)三个范围的员工数

GET my-index/_search
{
  "size": 0, 
  "aggs": {
    "group_count": {
      "range": {
        "field": "salary",
        "ranges": [
            {"to": 10000},
            {"from": 10000,"to":20000},  
            {"from": 20000}
        ]
      }
    }
  }
}

{
  "took": 5,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 6,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "group_count": {
      "buckets": [
        {
          "key": "*-10000.0",
          "to": 10000,
          "doc_count": 0
        },
        {
          "key": "10000.0-20000.0",
          "from": 10000,
          "to": 20000,
          "doc_count": 4
        },
        {
          "key": "20000.0-*",
          "from": 20000,
          "doc_count": 2
        }
      ]
    }
  }
}

五、Date Range聚合

专用于日期值的范围聚合。
这种聚合和普通范围聚合的主要区别在于，起始值和结束值可以用日期数学表达式表示，并且还可以指定响应中起始和结束字段的日期格式。
请注意，此聚合的每个范围都包含from值、不包含to值。

【例子】计算一年前之前发表的博文数，以及从一年前至今发表的博文总数

GET website/_search
{
  "size": 0, 
  "aggs": {
    "group_count": {
      "range": {
        "field": "postdate",
        "format":"yyyy-MM-dd",
        "ranges": [
            {"to": "now-12M/M"},
            {"from": "now-12M/M"}
        ]
      }
    }
  }
}



{
  "took": 29,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 9,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "group_count": {
      "buckets": [
        {
          "key": "*-2017-10-01",
          "to": 1506816000000,
          "to_as_string": "2017-10-01",
          "doc_count": 8
        },
        {
          "key": "2017-10-01-*",
          "from": 1506816000000,
          "from_as_string": "2017-10-01",
          "doc_count": 1
        }
      ]
    }
  }
}

六、Missing聚合

基于字段数据的单桶聚合，为当前文档集上下文中缺少字段值（实际上缺少该字段，或该字段被设置为配置的NULL值）的所有文档创建一个桶。
此聚合器通常会与其他字段数据桶聚合器（如范围聚合）一起使用，以返回由于缺少字段数据值而无法放入其他桶中的所有文档的信息。

GET my-index/_search
{
  "size": 0, 
  "aggs": {
    "noDep_count": {
      "missing": {"field": "salary"}
    }
  }
}

{
  "took": 4,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 9,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "noDep_count": {
      "doc_count": 3
    }
  }
}