First, follow the setup guides for Summa and Elasticsearch (ES).
Downloading Data
# Download sample dataset
# Look up the file name of the current English Wikibooks content dump in the
# directory listing. The surrounding double quotes restrict the match to a
# quoted value and are stripped by `tr`; `[^"]*` keeps the match inside one
# quoted string, and `head -n 1` guards against the name appearing twice.
CURRENT_DUMP=$(curl -s -L "https://dumps.wikimedia.org/other/cirrussearch/current" | grep -oE '"enwikibooks[^"]*content\.json\.gz"' | head -n 1 | tr -d '"')
# Fetch and unpack the dump only when a file name was found; with an empty
# name wget would save the directory listing itself as enwikibooks.json.gz.
if [ -n "$CURRENT_DUMP" ]; then
  wget "https://dumps.wikimedia.org/other/cirrussearch/current/$CURRENT_DUMP" -O enwikibooks.json.gz &&
    gunzip enwikibooks.json.gz
else
  echo "Could not find an enwikibooks content dump in the listing" >&2
fi
Preparing ES
# Write the Elasticsearch index definition (settings + field mappings) to es-wiki-schema.json.
# The heredoc delimiter is quoted so the JSON is written out literally.
cat > es-wiki-schema.json << 'EOF'
{
  "settings": {
    "analysis": {
      "analyzer": {
        "whitespace_lowercase": {
          "tokenizer": "whitespace",
          "filter": [ "lowercase" ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "auxiliary_text": {
        "type": "text"
      },
      "category": {
        "type": "text"
      },
      "content_model": {
        "type": "text"
      },
      "incoming_links": {
        "type": "long"
      },
      "language": {
        "type": "text"
      },
      "namespace": {
        "type": "long"
      },
      "opening_text": {
        "type": "text"
      },
      "outgoing_link": {
        "type": "text"
      },
      "popularity_score": {
        "type": "double"
      },
      "text": {
        "type": "text"
      },
      "create_timestamp": {
        "type": "date",
        "format": "strict_date_optional_time||epoch_millis"
      },
      "timestamp": {
        "type": "date",
        "format": "strict_date_optional_time||epoch_millis"
      },
      "title": {
        "type": "text"
      }
    }
  }
}
EOF
# (Optional) Drop the `books` index left over from an earlier run
curl --request DELETE localhost:9200/books
# Create the `books` index, sending es-wiki-schema.json as the request body
curl --request PUT \
  --header "Content-Type: application/json" \
  --data @es-wiki-schema.json \
  localhost:9200/books
# Patch dump for ES 8: replace every `"_type":"books"` with `"_index":"books"`.
# `sed -i.bak` (suffix attached to -i) is accepted by both GNU and BSD sed, so
# no separately installed `gsed` is required; the backup copy is removed once
# the in-place edit has succeeded.
sed -i.bak 's/"_type":"books"/"_index":"books"/g' enwikibooks.json && rm -f enwikibooks.json.bak
# Import dump into ES
# parallel reads the dump from stdin, groups it into two-line records (-L 2),
# hands 2000 records to each job (-N 2000) and runs up to 3 jobs at once (-j3);
# every job posts its chunk to the _bulk endpoint.
parallel --pipe -L 2 -N 2000 -j3 'curl -H "Content-Type: application/json" -s http://localhost:9200/books/_bulk --data-binary @-' < enwikibooks.json
# Do a test query
# The JSON body has to be passed with -d; given as a bare argument curl treats
# it as a second URL. The match runs against `text`, a field that is defined in
# the index mapping (the mapping has no `message` field).
curl -H "Content-Type: application/json" -s http://localhost:9200/books/_search -d '{"query": {"match": {"text": {"query": "this is a test"}}}}'
Preparing Summa
Create Index
Summa is a schemaful search engine. It requires you to define the fields you are going to use. Let’s create a schema for WikiBooks:
# Create index schema in file
# Writes schema.yaml. YAML nesting is significant: `index_attributes` and
# `index_engine` are mappings, and `schema` is a folded block scalar (`>`) whose
# indented body is the list of field definitions. The delimiter is quoted so
# the document is written out literally.
cat > schema.yaml << 'EOF'
---
# yamllint disable rule:key-ordering
blocksize: 131072
compression: Zstd
index_name: books
index_attributes:
  conflict_strategy: OVERWRITE_ALWAYS
  description: Wiki
  multi_fields: ["category"]
index_engine:
  file: {}
schema: >
  - name: category
    type: text
    options:
      indexing:
        fieldnorms: true
        record: position
        tokenizer: default
      stored: true
  - name: content_model
    type: text
    options:
      indexing:
        fieldnorms: true
        record: basic
        tokenizer: default
      stored: true
  - name: opening_text
    type: text
    options:
      indexing:
        fieldnorms: true
        record: position
        tokenizer: default
      stored: true
  - name: auxiliary_text
    type: text
    options:
      indexing:
        fieldnorms: true
        record: position
        tokenizer: default
      stored: true
  - name: language
    type: text
    options:
      indexing:
        fieldnorms: true
        record: basic
        tokenizer: default
      stored: true
  - name: title
    type: text
    options:
      indexing:
        fieldnorms: true
        record: position
        tokenizer: default
      stored: true
  - name: text
    type: text
    options:
      indexing:
        fieldnorms: true
        record: position
        tokenizer: default
      stored: true
  - name: timestamp
    type: date
    options:
      fast: true
      fieldnorms: false
      indexed: true
      stored: true
  - name: create_timestamp
    type: date
    options:
      fast: true
      fieldnorms: false
      indexed: true
      stored: true
  - name: popularity_score
    type: f64
    options:
      fast: true
      fieldnorms: false
      indexed: true
      stored: true
  - name: incoming_links
    type: u64
    options:
      fast: true
      fieldnorms: false
      indexed: true
      stored: true
  - name: namespace
    type: u64
    options:
      fast: true
      fieldnorms: false
      indexed: true
      stored: true
EOF
# Ask the Summa server at localhost:8082 to create the index described in schema.yaml
summa-cli "localhost:8082" - create-index-from-file "schema.yaml"
Add documents
# Send half of the documents to Summa: awk keeps every fourth line of the dump.
# The remaining half can be uploaded by switching the filter to `awk 'NR%4==2'`.
# Expect this to take a while, depending on how fast your machine is.
awk 'NR%4==0' enwikibooks.json |
  summa-cli localhost:8082 - index-document-stream books
# Commit the index so that the uploaded documents become searchable
summa-cli localhost:8082 - commit-index books
# Run a match query for "astronomy" against the `books` index, collecting the
# top-10 documents and the total hit count.
# The ` - ` separator after the endpoint is included so that this call has the
# same shape as the other summa-cli invocations in this guide.
summa-cli localhost:8082 - search '[{"index_alias": "books", "query": {"match": {"value": "astronomy"}}, "collectors": [{"top_docs": {"limit": 10}}, {"count": {}}]}]'
Benchmarking
ToDo