First, follow the setup guides for Summa and Elasticsearch (ES)
Downloading Data
# Download sample dataset
# Look up the file name of the current English Wikibooks content dump in the directory listing.
# `[^"]*` keeps the match inside a single quoted link target (a greedy `.*` could merge several
# links that share one line), and `head -n 1` guarantees that exactly one name is used.
CURRENT_DUMP=$(curl -s -L "https://dumps.wikimedia.org/other/cirrussearch/current" | grep -o '"enwikibooks[^"]*content\.json\.gz"' | head -n 1 | tr -d '"')
# Abort with a clear message instead of asking wget for a malformed URL when nothing matched
: "${CURRENT_DUMP:?could not find an enwikibooks content dump in the listing}"
wget "https://dumps.wikimedia.org/other/cirrussearch/current/$CURRENT_DUMP" -O enwikibooks.json.gz
# Unpack in place; this leaves enwikibooks.json next to where the archive was
gunzip enwikibooks.json.gz
Preparing ES
# Write the Elasticsearch index definition (analysis settings and field mappings) to es-wiki-schema.json.
# The heredoc delimiter is quoted so the JSON is written verbatim, without any shell expansion.
cat > es-wiki-schema.json << 'EOF'
{
  "settings": {
    "analysis": {
      "analyzer": {
        "whitespace_lowercase": {
          "tokenizer": "whitespace",
          "filter": [ "lowercase" ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "auxiliary_text": {
        "type": "text"
      },
      "category": {
        "type": "text"
      },
      "content_model": {
        "type": "text"
      },
      "incoming_links": {
        "type": "long"
      },
      "language": {
        "type": "text"
      },
      "namespace": {
        "type": "long"
      },
      "opening_text": {
        "type": "text"
      },
      "outgoing_link": {
        "type": "text"
      },
      "popularity_score": {
        "type": "double"
      },
      "text": {
        "type": "text"
      },
      "create_timestamp": {
        "type": "date",
        "format": "strict_date_optional_time||epoch_millis"
      },
      "timestamp": {
        "type": "date",
        "format": "strict_date_optional_time||epoch_millis"
      },
      "title": {
        "type": "text"
      }
    }
  }
}
EOF
# (Optional) Delete previously created index
curl -XDELETE localhost:9200/books
# Create ES index from the schema file written above
curl -H "Content-Type: application/json" -XPUT -d @es-wiki-schema.json localhost:9200/books
# Patch dump for ES 8: rewrite the "_type":"books" key of the action lines into "_index":"books".
# `-i.bak` is accepted by both GNU and BSD sed, so no separately installed `gsed` is required;
# the backup copy is removed once the edit succeeded.
sed -i.bak 's/"_type":"books"/"_index":"books"/g' enwikibooks.json && rm -f enwikibooks.json.bak
# Import dump into ES
# -L 2 makes every two lines one record, -N 2000 hands up to 2000 records to each curl call,
# -j3 runs three uploads at a time
parallel --pipe -L 2 -N 2000 -j3 'curl -H "Content-Type: application/json" -s http://localhost:9200/books/_bulk --data-binary @-' < enwikibooks.json
# Do a test query
# The JSON body must be passed with -d (otherwise curl treats it as a second URL), and the match
# has to target a field that exists in the mapping, here "text".
curl -H "Content-Type: application/json" -s http://localhost:9200/books/_search -d '{"query": {"match": {"text": {"query": "this is a test"}}}}'
Preparing Summa
Create Index
Summa is a schemaful search engine: it requires you to define the fields you are going to use. Let’s create a schema for WikiBooks:
# Write the Summa index definition for the `books` index to schema.yaml.
# The heredoc delimiter is quoted so the YAML is written verbatim, without any shell expansion.
cat > schema.yaml << 'EOF'
---
# yamllint disable rule:key-ordering
blocksize: 131072
compression: Zstd
index_name: books
index_attributes:
  conflict_strategy: OVERWRITE_ALWAYS
  description: Wiki
  multi_fields: ["category"]
index_engine:
  file: {}
schema: >
  - name: category
    type: text
    options:
      indexing:
        fieldnorms: true
        record: position
        tokenizer: default
      stored: true
  - name: content_model
    type: text
    options:
      indexing:
        fieldnorms: true
        record: basic
        tokenizer: default
      stored: true
  - name: opening_text
    type: text
    options:
      indexing:
        fieldnorms: true
        record: position
        tokenizer: default
      stored: true
  - name: auxiliary_text
    type: text
    options:
      indexing:
        fieldnorms: true
        record: position
        tokenizer: default
      stored: true
  - name: language
    type: text
    options:
      indexing:
        fieldnorms: true
        record: basic
        tokenizer: default
      stored: true
  - name: title
    type: text
    options:
      indexing:
        fieldnorms: true
        record: position
        tokenizer: default
      stored: true
  - name: text
    type: text
    options:
      indexing:
        fieldnorms: true
        record: position
        tokenizer: default
      stored: true
  - name: timestamp
    type: date
    options:
      fast: true
      fieldnorms: false
      indexed: true
      stored: true
  - name: create_timestamp
    type: date
    options:
      fast: true
      fieldnorms: false
      indexed: true
      stored: true
  - name: popularity_score
    type: f64
    options:
      fast: true
      fieldnorms: false
      indexed: true
      stored: true
  - name: incoming_links
    type: u64
    options:
      fast: true
      fieldnorms: false
      indexed: true
      stored: true
  - name: namespace
    type: u64
    options:
      fast: true
      fieldnorms: false
      indexed: true
      stored: true
EOF
# Register the index on the Summa server at localhost:8082 using the file written above
summa-cli localhost:8082 - create-index-from-file schema.yaml
Add documents
# Upload half of the documents to Summa. You can upload the remaining half by setting `awk 'NR%4==2'`
# The filter keeps every fourth line of the dump; presumably the dump alternates action lines and
# document lines (the ES import above reads it in two-line records), so this is every other document.
# It will take a while depending on the performance of your computer
awk 'NR%4==0' enwikibooks.json | summa-cli localhost:8082 - index-document-stream books
# Commit the index to make the uploaded documents searchable
summa-cli localhost:8082 - commit-index books
# Do a match query that returns the top-10 documents and the total count
# The `-` separator after the endpoint matches every other summa-cli call in this guide;
# without it `search` is not separated from the endpoint arguments.
summa-cli localhost:8082 - search '{"index_alias": "books", "query": {"match": {"value": "astronomy"}}, "collectors": [{"top_docs": {"limit": 10}}, {"count": {}}]}'
Benchmarking
ToDo