MySQL 9.1.0
Source Code Documentation
|
Full Text Search functionality. More...
#include <math.h>
#include <sys/types.h>
#include <iomanip>
#include <vector>
#include "dict0dict.h"
#include "fts0ast.h"
#include "fts0fts.h"
#include "fts0pars.h"
#include "fts0plugin.h"
#include "fts0priv.h"
#include "fts0types.h"
#include "ha_prototypes.h"
#include "lob0lob.h"
#include "my_dbug.h"
#include "mysql/strings/m_ctype.h"
#include "row0sel.h"
#include "ut0new.h"
#include "ut0rbt.h"
Classes | |
struct | fts_query_t |
State of an FTS query. More... | |
struct | fts_match_t |
For phrase matching, first we collect the documents and the positions then we match. More... | |
struct | fts_select_t |
For matching tokens in a phrase search. More... | |
struct | fts_proximity_t |
structure defines a set of ranges for original documents, each of which has a minimum position and maximum position. More... | |
struct | fts_phrase_t |
The match positions and tokens to match. More... | |
struct | fts_phrase_param_t |
Parameter passed to fts phrase match by parser. More... | |
struct | fts_doc_freq_t |
For storing the frequency of a word/term in a document. More... | |
struct | fts_word_freq_t |
To determine the word frequency per document. More... | |
Macros | |
#define | FTS_ELEM(t, n, i, j) (t[(i)*n + (j)]) |
#define | RANK_DOWNGRADE (-1.0F) |
#define | RANK_UPGRADE (1.0F) |
#define | MAX_PROXIMITY_ITEM 128 |
#define | SIZEOF_RBT_CREATE sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2 |
#define | SIZEOF_RBT_NODE_ADD sizeof(ib_rbt_node_t) |
#define | RANKING_WORDS_INIT_LEN 4 |
Typedefs | |
typedef std::vector< fts_string_t, ut::allocator< fts_string_t > > | word_vector_t |
typedef std::vector< ulint, ut::allocator< ulint > > | pos_vector_t |
Functions | |
int | innobase_fts_nocase_compare (const CHARSET_INFO *cs, const fts_string_t *s1, const fts_string_t *s2) |
Compare two FTS character strings case insensitively according to their charset. More... | |
static bool | fts_query_index_fetch_nodes (void *row, void *user_arg) |
Callback function to fetch the rows in an FTS INDEX record. More... | |
static dberr_t | fts_query_filter_doc_ids (fts_query_t *query, const fts_string_t *word, fts_word_freq_t *word_freq, const fts_node_t *node, void *data, ulint len, bool calc_doc_count) |
Read and filter nodes. More... | |
static dberr_t | fts_ast_visit_sub_exp (fts_ast_node_t *node, fts_ast_callback visitor, void *arg) |
Process (nested) sub-expression, create a new result set to store the sub-expression result by processing nodes under current sub-expression list. More... | |
static dberr_t | fts_expand_query (dict_index_t *index, fts_query_t *query) |
This function implements a simple "blind" query expansion search: words in documents found in the first search pass will be used as search arguments to search the document again, thus "expand" the search result set. More... | |
static bool | fts_phrase_or_proximity_search (fts_query_t *query, ib_vector_t *tokens) |
This function finds documents that contain all words in a phrase or proximity search. More... | |
static bool | fts_proximity_get_positions (fts_match_t **match, ulint num_match, ulint distance, fts_proximity_t *qualified_pos) |
This function checks whether words in result documents are close to each other (within proximity range as specified by "distance"). More... | |
static int | fts_query_compare_rank (const void *p1, const void *p2) |
Compare two fts_ranking_t instances on their rank value and doc ids in descending order on the rank and ascending order on doc id. More... | |
static void | fts_ranking_words_create (fts_query_t *query, fts_ranking_t *ranking) |
Create words in ranking. More... | |
static void | fts_ranking_words_add (fts_query_t *query, fts_ranking_t *ranking, const fts_string_t *word) |
Add a word into ranking. More... | |
static bool | fts_ranking_words_get_next (const fts_query_t *query, fts_ranking_t *ranking, ulint *pos, fts_string_t *word) |
Get a word from a ranking. More... | |
static fts_word_freq_t * | fts_query_add_word_freq (fts_query_t *query, const fts_string_t *word) |
Add a word if it doesn't exist, to the term freq RB tree. More... | |
static fts_doc_freq_t * | fts_query_add_doc_freq (fts_query_t *query, ib_rbt_t *doc_freqs, doc_id_t doc_id) |
Add a doc id if it doesn't exist, to the doc freq RB tree. More... | |
static void | fts_query_union_doc_id (fts_query_t *query, doc_id_t doc_id, fts_rank_t rank) |
Add the doc id to the query set only if it's not in the deleted array. More... | |
static void | fts_query_remove_doc_id (fts_query_t *query, doc_id_t doc_id) |
Remove the doc id from the query set only if it's not in the deleted set. More... | |
static void | fts_query_change_ranking (fts_query_t *query, doc_id_t doc_id, bool downgrade) |
Find the doc id in the query set but not in the deleted set, artificially downgrade or upgrade its ranking by a value and make/initialize its ranking under or above its normal range 0 to 1. More... | |
static void | fts_query_intersect_doc_id (fts_query_t *query, doc_id_t doc_id, fts_rank_t rank) |
Check the doc id in the query set only if it's not in the deleted array. More... | |
static void | fts_query_free_doc_ids (fts_query_t *query, ib_rbt_t *doc_ids) |
Free the document ranking rb tree. More... | |
static void | fts_query_add_word_to_document (fts_query_t *query, doc_id_t doc_id, const fts_string_t *word) |
Add the word to the documents "list" of matching words from the query. More... | |
static void | fts_query_check_node (fts_query_t *query, const fts_string_t *token, const fts_node_t *node) |
Check the node ilist. More... | |
static ulint | fts_cache_find_wildcard (fts_query_t *query, const fts_index_cache_t *index_cache, const fts_string_t *token) |
Search index cache for word with wildcard match. More... | |
static dberr_t | fts_query_difference (fts_query_t *query, const fts_string_t *token) |
Set difference. More... | |
static dberr_t | fts_query_intersect (fts_query_t *query, const fts_string_t *token) |
Intersect the token doc ids with the current set. More... | |
static dberr_t | fts_query_cache (fts_query_t *query, const fts_string_t *token) |
Query index cache. More... | |
static dberr_t | fts_query_union (fts_query_t *query, fts_string_t *token) |
Set union. More... | |
static dberr_t | fts_query_process_doc_id (fts_query_t *query, doc_id_t doc_id, fts_rank_t rank) |
Depending upon the current query operator process the doc id. More... | |
static dberr_t | fts_merge_doc_ids (fts_query_t *query, const ib_rbt_t *doc_ids) |
Merge two result sets. More... | |
static byte * | fts_query_skip_word (byte *ptr, const byte *end) |
Skip non-whitespace in a string. More... | |
static bool | fts_query_match_phrase_terms (fts_phrase_t *phrase, byte **start, const byte *end, mem_heap_t *heap) |
Check whether the remaining terms in the phrase match the text. More... | |
static bool | fts_proximity_is_word_in_range (const fts_phrase_t *phrase, byte *start, ulint total_len) |
Callback function to count the number of words in position ranges, and see whether the word count is in specified "phrase->distance". More... | |
static int | fts_query_match_phrase_add_word_for_parser (MYSQL_FTPARSER_PARAM *param, char *word, int word_len, MYSQL_FTPARSER_BOOLEAN_INFO *info) |
FTS plugin parser 'mysql_add_word' callback function for phrase match. Refer to 'MYSQL_FTPARSER_PARAM' for more detail. More... | |
static bool | fts_query_match_phrase_terms_by_parser (fts_phrase_param_t *phrase_param, st_mysql_ftparser *parser, byte *text, ulint len) |
Check whether the terms in the phrase match the text. More... | |
static bool | fts_query_match_phrase (fts_phrase_t *phrase, byte *start, ulint cur_len, ulint prev_len, mem_heap_t *heap) |
Callback function to fetch and search the document. More... | |
static bool | fts_query_fetch_document (void *row, void *user_arg) |
Callback function to fetch and search the document. More... | |
static dberr_t | fts_query_match_document (ib_vector_t *tokens, fts_get_doc_t *get_doc, fts_match_t *match, ulint distance, st_mysql_ftparser *parser, bool *found) |
Retrieve the document and match the phrase tokens. More... | |
static bool | fts_query_is_in_proximity_range (const fts_query_t *query, fts_match_t **match, fts_proximity_t *qualified_pos) |
This function fetches the original documents and counts the words between matching words to check whether they are within the specified distance. More... | |
static dberr_t | fts_query_search_phrase (fts_query_t *query, ib_vector_t *orig_tokens, ib_vector_t *tokens) |
Iterate over the matched document ids and search for the actual phrase in the text. More... | |
static void | fts_query_phrase_split (fts_query_t *query, const fts_ast_node_t *node, ib_vector_t *tokens, ib_vector_t *orig_tokens, mem_heap_t *heap) |
Split the phrase into tokens. More... | |
static dberr_t | fts_query_phrase_search (fts_query_t *query, const fts_ast_node_t *node) |
Text/Phrase search. More... | |
static dberr_t | fts_query_execute (fts_query_t *query, fts_string_t *token) |
Find the word and evaluate. More... | |
static byte * | fts_query_get_token (fts_ast_node_t *node, fts_string_t *token) |
Create a wildcard string. More... | |
static dberr_t | fts_query_visitor (fts_ast_oper_t oper, fts_ast_node_t *node, void *arg) |
Visit every node of the AST. More... | |
static dberr_t | fts_query_read_node (fts_query_t *query, const fts_string_t *word, que_node_t *exp) |
Read the FTS INDEX row. More... | |
static void | fts_query_calculate_idf (fts_query_t *query) |
Calculate the inverse document frequency (IDF) for all the terms. More... | |
static void | fts_query_calculate_ranking (const fts_query_t *query, fts_ranking_t *ranking) |
Calculate the ranking of the document. More... | |
static void | fts_query_add_ranking (fts_query_t *query, ib_rbt_t *ranking_tree, const fts_ranking_t *new_ranking) |
Add ranking to the result set. More... | |
float | fts_retrieve_ranking (fts_result_t *result, doc_id_t doc_id) |
Retrieve the FTS Relevance Ranking result for doc with doc_id. More... | |
static fts_result_t * | fts_query_prepare_result (fts_query_t *query, fts_result_t *result) |
Create the result and copy the data to it. More... | |
static fts_result_t * | fts_query_get_result (fts_query_t *query, fts_result_t *result) |
Get the result of the query. More... | |
static void | fts_query_free (fts_query_t *query) |
FTS Query free resources and reset. More... | |
static fts_ast_node_t * | fts_query_parse (fts_query_t *query, byte *query_str, ulint query_len) |
Parse the query using flex/bison or plugin parser. More... | |
static void | fts_query_can_optimize (fts_query_t *query, uint flags) |
FTS Query optimization Set FTS_OPT_RANKING if it is a simple term query. More... | |
dberr_t | fts_query (trx_t *trx, dict_index_t *index, uint flags, const byte *query_str, ulint query_len, fts_result_t **result, ulonglong limit) |
FTS Query entry point. More... | |
void | fts_query_free_result (fts_result_t *result) |
FTS Query free result, returned by fts_query(). More... | |
void | fts_query_sort_result_on_rank (fts_result_t *result) |
FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. More... | |
static void | fts_print_doc_id (fts_query_t *query) |
A debug function to print result doc_id set. More... | |
Variables | |
static ib_rbt_compare | fts_ranking_doc_id_cmp |
Full Text Search functionality.
Created 2007/03/27 Sunny Bains Completed 2011/7/10 Sunny and Jimmy Yang
#define MAX_PROXIMITY_ITEM 128 |
#define RANK_DOWNGRADE (-1.0F) |
#define RANK_UPGRADE (1.0F) |
#define RANKING_WORDS_INIT_LEN 4 |
#define SIZEOF_RBT_CREATE sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2 |
#define SIZEOF_RBT_NODE_ADD sizeof(ib_rbt_node_t) |
typedef std::vector<ulint, ut::allocator<ulint> > pos_vector_t |
typedef std::vector<fts_string_t, ut::allocator<fts_string_t> > word_vector_t |
|
static |
Process (nested) sub-expression, create a new result set to store the sub-expression result by processing nodes under current sub-expression list.
Merge the sub-expression result with that of parent expression list.
[in,out] | node | current root node |
[in,out] | visitor | callback function |
[in,out] | arg | argument for callback |
|
static |
Search index cache for word with wildcard match.
query | in: query instance |
index_cache | in: cache to search |
token | in: token to search |
|
static |
This function implements a simple "blind" query expansion search: words in documents found in the first search pass will be used as search arguments to search the document again, thus "expand" the search result set.
index | in: FTS index to search |
query | in: FTS query instance |
|
static |
Merge two result sets.
query | in,out: query instance |
doc_ids | in: result set to merge |
|
static |
This function finds documents that contain all words in a phrase or proximity search.
And if proximity search, verify the words are close enough to each other, as in specified distance. This function is called for phrase and proximity search.
query | in/out: query instance query->doc_ids might be instantiated with qualified doc IDs |
tokens | in: Tokens contain words |
|
static |
A debug function to print result doc_id set.
query | in : tree that stores doc_ids. |
|
static |
This function checks whether words in result documents are close to each other (within proximity range as specified by "distance").
If "distance" is MAX_ULINT, then it will find all combinations of positions of matching words and store min and max positions in the "qualified_pos" for later verification.
match | in: query instance |
num_match | in: number of matching items |
distance | in: distance value for proximity search |
qualified_pos | out: the position info records ranges containing all matching words. |
|
static |
Callback function to count the number of words in position ranges, and see whether the word count is in specified "phrase->distance".
phrase | in: phrase with the search info |
start | in: text to search |
total_len | in: length of text |
dberr_t fts_query | ( | trx_t * | trx, |
dict_index_t * | index, | ||
uint | flags, | ||
const byte * | query_str, | ||
ulint | query_len, | ||
fts_result_t ** | result, | ||
ulonglong | limit | ||
) |
FTS Query entry point.
[in] | trx | transaction |
[in] | index | fts index to search |
[in] | flags | FTS search mode |
[in] | query_str | FTS query |
[in] | query_len | FTS query string len in bytes |
[in,out] | result | result doc ids |
[in] | limit | limit value |
|
static |
Add a doc id if it doesn't exist, to the doc freq RB tree.
query | in: query instance |
doc_freqs | in: rb tree of fts_doc_freq_t |
doc_id | in: doc id to add |
|
static |
Add ranking to the result set.
query | in: query state |
ranking_tree | in: ranking tree |
new_ranking | in: ranking of a document |
|
static |
Add a word if it doesn't exist, to the term freq RB tree.
We store a pointer to the word that is passed in as the argument.
query | in: query instance |
word | in: term/word to add |
|
static |
Add the word to the documents "list" of matching words from the query.
We make a copy of the word from the query heap.
query | in: query to update |
doc_id | in: the document to update |
word | in: the token to add |
|
static |
Query index cache.
query | in/out: query instance |
token | in: token to search |
|
static |
Calculate the inverse document frequency (IDF) for all the terms.
query | in: Query state |
|
static |
Calculate the ranking of the document.
query | in: query state |
ranking | in: Document to rank |
|
static |
FTS Query optimization Set FTS_OPT_RANKING if it is a simple term query.
query | in/out: query instance |
flags | in: FTS search mode |
|
static |
Find the doc id in the query set but not in the deleted set, artificially downgrade or upgrade its ranking by a value and make/initialize its ranking under or above its normal range 0 to 1.
This is used for Boolean Search operators such as the Negation operator, which makes a word's contribution to the row's relevance negative.
query | in: query instance |
doc_id | in: the doc id to add |
downgrade | in: Whether to downgrade ranking |
|
static |
Check the node ilist.
query | in: query to update |
token | in: the token to search |
node | in: node to check |
|
static |
Compare two fts_ranking_t instances on their rank value and doc ids in descending order on the rank and ascending order on doc id.
p1 | in: pointer to elem |
p2 | in: pointer to elem |
|
static |
Set difference.
query | in: query instance |
token | in: token to search |
|
static |
Find the word and evaluate.
query | in: query instance |
token | in: token to search |
|
static |
Callback function to fetch and search the document.
row | in: sel_node_t* |
user_arg | in: fts_doc_t* |
|
static |
Read and filter nodes.
query | in: query instance |
word | in: the current word |
word_freq | in/out: word frequency |
node | in: current FTS node |
data | in: doc id ilist |
len | in: doc id ilist size |
calc_doc_count | in: whether to remember doc count |
|
static |
FTS Query free resources and reset.
query | in: query instance to free |
|
static |
Free the document ranking rb tree.
query | in: query instance |
doc_ids | in: rb tree to free |
void fts_query_free_result | ( | fts_result_t * | result | ) |
FTS Query free result, returned by fts_query().
in: result instance to free.
result | in: result instance to free. |
|
static |
Get the result of the query.
Calculate the similarity coefficient.
query | in: query instance |
result | in: result |
|
static |
Create a wildcard string.
It's the responsibility of the caller to free the byte* pointer. It's allocated using ut::malloc_withkey(UT_NEW_THIS_FILE_PSI_KEY).
node | in: the current sub tree |
token | in: token to create |
|
static |
Callback function to fetch the rows in an FTS INDEX record.
row | in: sel_node_t* |
user_arg | in: pointer to fts_fetch_t |
|
static |
Intersect the token doc ids with the current set.
query | in: query instance |
token | in: the token to search |
|
static |
Check the doc id in the query set only if it's not in the deleted array.
The doc ids that were found are stored in another rb tree (fts_query_t::intersect).
query | in: query instance |
doc_id | in: the doc id to add |
rank | in: if non-zero, it is the rank associated with the doc_id |
|
static |
This function fetches the original documents and counts the words between matching words to check whether they are within the specified distance.
query | in: query instance |
match | in: query instance |
qualified_pos | in: position info for qualified ranges |
|
static |
Retrieve the document and match the phrase tokens.
tokens | in: phrase tokens |
get_doc | in: table and prepared statements |
match | in: doc id and positions |
distance | in: proximity distance |
parser | in: fts plugin parser |
found | out: true if phrase found |
|
static |
Callback function to fetch and search the document.
[in] | phrase | phrase to match |
[in] | start | text to search, we can't make this const because we need to first convert the string to lowercase |
[in] | cur_len | length of text |
[in] | prev_len | total length for searched doc fields |
[in] | heap | heap |
|
static |
FTS plugin parser 'mysql_add_word' callback function for phrase match. Refer to 'MYSQL_FTPARSER_PARAM' for more detail.
param | in: parser param |
word | in: token |
word_len | in: token length |
info | in: token info |
|
static |
Check whether the remaining terms in the phrase match the text.
phrase | in: phrase to match |
start | in/out: text to search, we can't make this const because we need to first convert the string to lowercase |
end | in: pointer to the end of the string to search |
heap | in: heap |
|
static |
Check whether the terms in the phrase match the text.
|
static |
Parse the query using flex/bison or plugin parser.
query | in: query instance |
query_str | in: query string |
query_len | in: query string length |
|
static |
Text/Phrase search.
query | in: query instance |
node | in: node to search |
|
static |
Split the phrase into tokens.
[in,out] | query | query instance |
[in] | node | query node to search |
[in,out] | tokens | token vector |
[in,out] | orig_tokens | original node tokens include stopword |
[in,out] | heap | mem heap |
|
static |
Create the result and copy the data to it.
query | in: Query state |
result | in: result this can contain data from a previous search on another FTS index |
|
static |
Depending upon the current query operator process the doc id.
Returns DB_SUCCESS if all goes well, otherwise returns DB_FTS_EXCEED_RESULT_CACHE_LIMIT.
query | in: query instance |
doc_id | in: doc id to process |
rank | in: if non-zero, it is the rank associated with the doc_id |
|
static |
Read the FTS INDEX row.
query | in: query instance |
word | in: current word |
exp | in: query graph node |
|
static |
Remove the doc id from the query set only if it's not in the deleted set.
query | in: query instance |
doc_id | in: the doc id to remove |
|
static |
Iterate over the matched document ids and search for the actual phrase in the text.
query | in: query instance |
orig_tokens | in: tokens to search, with any stopwords in the original phrase |
tokens | in: tokens that do not include stopwords and can be used to calculate ranking |
Skip non-whitespace in a string.
Move ptr to the next word boundary.
ptr | in: start of scan |
end | in: pointer to end of string |
void fts_query_sort_result_on_rank | ( | fts_result_t * | result | ) |
FTS Query sort result, returned by fts_query() on fts_ranking_t::rank.
out: result instance to sort.
result | out: result instance to sort. |
|
static |
Set union.
query | in: query instance |
token | in: token to search |
|
static |
Add the doc id to the query set only if it's not in the deleted array.
query | in: query instance |
doc_id | in: the doc id to add |
rank | in: if non-zero, it is the rank associated with the doc_id |
|
static |
Visit every node of the AST.
oper | in: current operator |
node | in: The root of the current subtree |
arg | in: callback arg |
|
static |
Add a word into ranking.
query | in: query instance |
ranking | in: ranking instance |
word | in: term/word to add |
|
static |
Create words in ranking.
query | in: query instance |
ranking | in: ranking instance |
|
static |
Get a word from a ranking.
query | in: query instance |
ranking | in: ranking instance |
pos | in/out: word start pos |
word | in/out: term/word to add |
float fts_retrieve_ranking | ( | fts_result_t * | result, |
doc_id_t | doc_id | ||
) |
Retrieve the FTS Relevance Ranking result for doc with doc_id.
result | in: FTS result structure |
doc_id | in: doc_id of the item to retrieve |
int innobase_fts_nocase_compare | ( | const CHARSET_INFO * | cs, |
const fts_string_t * | s1, | ||
const fts_string_t * | s2 | ||
) |
Compare two FTS character strings case insensitively according to their charset.
This assumes that s1 is already in lower case.
[in] | cs | character set |
[in] | s1 | key |
[in] | s2 | node |
|
static |