MySQL 8.4.2
Source Code Documentation
ftdefs.h
Go to the documentation of this file.
1/* Copyright (c) 2000, 2024, Oracle and/or its affiliates.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is designed to work with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have either included with
13 the program or referenced in the documentation.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License, version 2.0, for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
23
24/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
25
26/**
27 @file storage/myisam/ftdefs.h
28 Some definitions for full-text indices.
29*/
30
31#include <math.h>
32#include <mysql/plugin.h>
33#include <cstdint>
34
35#include "my_tree.h"
39
40inline bool true_word_char(int c, uint8_t ch) {
41 return ((c & (MY_CHAR_U | MY_CHAR_L | MY_CHAR_NMR)) != 0) || ch == '_';
42}
43
/* Word-length limit used when sorting, going by the name.
   NOTE(review): the unit (bytes or characters) and the exact use are not
   visible in this header - confirm at the use sites. */
44#define FT_MAX_WORD_LEN_FOR_SORT 31
45
/* 65536 = 64 KiB.
   NOTE(review): presumably the allocation block size of the MEM_ROOT used by
   the full-text parser - confirm in ft_parser.cc. */
46#define FTPARSER_MEMROOT_ALLOC_SIZE 65536
47
48/* Interested readers may consult SMART
49 (ftp://ftp.cs.cornell.edu/pub/smart/smart.11.0.tar.Z)
50 for an excellent implementation of the vector space model we use.
51 It also demonstrates the usage of different weighting techniques.
52 This code, though, is completely original and is not based on the
53 SMART code but was in some cases inspired by it.
54
55 NORM_PIVOT was taken from the article
56 A.Singhal, C.Buckley, M.Mitra, "Pivoted Document Length Normalization",
57 ACM SIGIR'96, 21-29, 1996
58 */
59
/*
  Weighting formula selection.

  The five aliases below each name one of the candidate formulas defined
  further down; re-pointing an alias switches the formula everywhere the
  alias is expanded.

  None of the formula macros is self-contained: they expand to expressions
  over identifiers that must already be in scope at the point of use
  (count, p, docstat, sum, sum2, suml, doc_cnt, aio).

  NOTE(review): LWS / GWS presumably stand for "local" / "global weighting
  scheme", following SMART terminology - confirm.
*/
60#define LWS_FOR_QUERY LWS_TF
61#define LWS_IN_USE LWS_LOG
62#define PRENORM_IN_USE PRENORM_AVG
63#define NORM_IN_USE NORM_PIVOT
64#define GWS_IN_USE GWS_PROB
65/*==============================================================*/
/* LWS_*: formulas over `count` only. */
66#define LWS_TF (count)
67#define LWS_BINARY (count > 0)
68#define LWS_SQUARE (count * count)
/* Yields 0 when count is 0, so log() is never applied to zero. */
69#define LWS_LOG (count ? (log((double)count) + 1) : 0)
70/*--------------------------------------------------------------*/
/* PRENORM_*: formulas over p->weight and the docstat fields max, sum, uniq. */
71#define PRENORM_NONE (p->weight)
72#define PRENORM_MAX (p->weight / docstat.max)
73#define PRENORM_AUG (0.4 + 0.6 * p->weight / docstat.max)
74#define PRENORM_AVG (p->weight / docstat.sum * docstat.uniq)
75#define PRENORM_AVGLOG \
76 ((1 + log(p->weight)) / (1 + log(docstat.sum / docstat.uniq)))
77/*--------------------------------------------------------------*/
/* NORM_*: normalization factors built from docstat.nsum, nsum2 and uniq. */
78#define NORM_NONE (1)
79#define NORM_SUM (docstat.nsum)
80#define NORM_COS (sqrt(docstat.nsum2))
81
/* PIVOT_VAL is the coefficient applied to docstat.uniq in NORM_PIVOT. */
82#define PIVOT_VAL (0.0115)
83#define NORM_PIVOT (1 + PIVOT_VAL * docstat.uniq)
84/*---------------------------------------------------------------*/
/* GWS_*: formulas over sum, sum2, suml, doc_cnt and
   aio->info->state->records. */
85#define GWS_NORM (1 / sqrt(sum2))
86#define GWS_GFIDF (sum / doc_cnt)
87/* Mysterious, but w/o (double) GWS_IDF performs better :-o */
/* GWS_IDF divides without a cast while GWS_IDF1 casts records to double
   first. NOTE(review): the types of records and doc_cnt are not visible
   here; if both are integers, GWS_IDF truncates the quotient before log(). */
88#define GWS_IDF log(aio->info->state->records / doc_cnt)
89#define GWS_IDF1 log((double)aio->info->state->records / doc_cnt)
/* Evaluates to 0 unless records > doc_cnt, so the difference passed into
   log() is always positive. */
90#define GWS_PROB \
91 ((aio->info->state->records > doc_cnt) \
92 ? log(((double)(aio->info->state->records - doc_cnt)) / doc_cnt) \
93 : 0)
94#define GWS_FREQ (1.0 / doc_cnt)
/* Second and third power of the GWS_IDF1 expression. */
95#define GWS_SQUARED pow(log((double)aio->info->state->records / doc_cnt), 2)
96#define GWS_CUBIC pow(log((double)aio->info->state->records / doc_cnt), 3)
97#define GWS_ENTROPY \
98 (1 - (suml / sum - log(sum)) / log(aio->info->state->records))
99/*=================================================================*/
100
101/* Boolean search operators.

   Each macro names one position of ft_boolean_syntax, so the operator
   characters are whatever that array holds when the macro is expanded.
   Position 9 has no macro in this list: FTB_TRUNC is [8] and FTB_LQUOT
   is [10]. */
102#define FTB_YES (ft_boolean_syntax[0])
103#define FTB_EGAL (ft_boolean_syntax[1])
104#define FTB_NO (ft_boolean_syntax[2])
105#define FTB_INC (ft_boolean_syntax[3])
106#define FTB_DEC (ft_boolean_syntax[4])
107#define FTB_LBR (ft_boolean_syntax[5])
108#define FTB_RBR (ft_boolean_syntax[6])
109#define FTB_NEG (ft_boolean_syntax[7])
110#define FTB_TRUNC (ft_boolean_syntax[8])
111#define FTB_LQUOT (ft_boolean_syntax[10])
112#define FTB_RQUOT (ft_boolean_syntax[11])
113
114#ifdef __cplusplus
115extern "C" {
116#endif
117
/**
  One word as handled by the full-text code: where it starts, how long it
  is, and its weight.

  NOTE(review): the field descriptions follow the generated cross-reference
  for the same-named struct; the unit of len (bytes or characters) and the
  ownership of the memory behind pos are not visible here - confirm.
*/
118struct FT_WORD {
119 uchar *pos; /* word start pointer */
120 uint len; /* word length */
121 double weight; /* word weight */
122};
123
124int is_stopword(char *word, uint len);
125
126uint _ft_make_key(MI_INFO *, uint, uchar *, FT_WORD *, my_off_t);
127
130uchar ft_simple_get_word(const CHARSET_INFO *, uchar **, const uchar *,
131 FT_WORD *, bool);
132
133typedef struct _st_ft_seg_iterator {
134 uint num, len;
136 const uchar *rec, *pos;
138
139void _mi_ft_segiterator_init(MI_INFO *, uint, const uchar *, FT_SEG_ITERATOR *);
142
143void ft_parse_init(TREE *, const CHARSET_INFO *);
144int ft_parse(TREE *, uchar *, int, struct st_mysql_ftparser *parser,
147FT_WORD *_mi_ft_parserecord(MI_INFO *, uint, const uchar *, MEM_ROOT *);
148uint _mi_ft_parse(TREE *, MI_INFO *, uint, const uchar *,
150
151FT_INFO *ft_init_nlq_search(MI_INFO *, uint, uchar *, uint, uint, uchar *);
153 const CHARSET_INFO *);
154
155extern const struct _ft_vft _ft_vft_nlq;
156int ft_nlq_read_next(FT_INFO *, char *);
157float ft_nlq_find_relevance(FT_INFO *, uchar *, uint);
162
163extern const struct _ft_vft _ft_vft_boolean;
164
165int ft_boolean_read_next(FT_INFO *, char *);
166float ft_boolean_find_relevance(FT_INFO *, uchar *, uint);
171
174 uint keynr,
175 uint paramnr);
176extern void ftparser_call_deinitializer(MI_INFO *info);
177
178#ifdef __cplusplus
179} // extern "C"
180#endif
FT_WORD * ft_linearize(TREE *, MEM_ROOT *)
Definition: ft_parser.cc:68
void ftparser_call_deinitializer(MI_INFO *info)
Definition: ft_parser.cc:364
void ft_parse_init(TREE *, const CHARSET_INFO *)
Definition: ft_parser.cc:245
FT_INFO * ft_init_nlq_search(MI_INFO *, uint, uchar *, uint, uint, uchar *)
Definition: ft_nlq_search.cc:220
int ft_nlq_read_next(FT_INFO *, char *)
Definition: ft_nlq_search.cc:309
const struct _ft_vft _ft_vft_boolean
Definition: ft_static.cc:62
const struct _ft_vft _ft_vft_nlq
Definition: ft_static.cc:59
int ft_parse(TREE *, uchar *, int, struct st_mysql_ftparser *parser, MYSQL_FTPARSER_PARAM *, MEM_ROOT *)
Definition: ft_parser.cc:291
float ft_nlq_find_relevance(FT_INFO *, uchar *, uint)
Definition: ft_nlq_search.cc:332
void ft_boolean_close_search(FT_INFO *)
Definition: ft_boolean_search.cc:942
bool true_word_char(int c, uint8_t ch)
Definition: ftdefs.h:40
void _mi_ft_segiterator_dummy_init(const uchar *, uint, FT_SEG_ITERATOR *)
Definition: ft_update.cc:50
void ft_nlq_close_search(FT_INFO *)
Definition: ft_nlq_search.cc:357
float ft_boolean_find_relevance(FT_INFO *, uchar *, uint)
Definition: ft_boolean_search.cc:885
uchar ft_get_word(const CHARSET_INFO *, uchar **, uchar *, FT_WORD *, MYSQL_FTPARSER_BOOLEAN_INFO *)
Definition: ft_parser.cc:121
void ft_nlq_reinit_search(FT_INFO *)
Definition: ft_nlq_search.cc:364
float ft_nlq_get_relevance(FT_INFO *)
Definition: ft_nlq_search.cc:359
my_off_t ft_nlq_get_docid(FT_INFO *)
MYSQL_FTPARSER_PARAM * ftparser_call_initializer(MI_INFO *info, uint keynr, uint paramnr)
Definition: ft_parser.cc:335
uint _mi_ft_parse(TREE *, MI_INFO *, uint, const uchar *, MYSQL_FTPARSER_PARAM *, MEM_ROOT *)
Definition: ft_update.cc:103
FT_WORD * _mi_ft_parserecord(MI_INFO *, uint, const uchar *, MEM_ROOT *)
Definition: ft_update.cc:122
struct _st_ft_seg_iterator FT_SEG_ITERATOR
FT_INFO * ft_init_boolean_search(MI_INFO *, uint, uchar *, uint, const CHARSET_INFO *)
Definition: ft_boolean_search.cc:517
int is_stopword(char *word, uint len)
Definition: ft_stopwords.cc:126
uint _mi_ft_segiterator(FT_SEG_ITERATOR *)
Definition: ft_update.cc:69
my_off_t ft_boolean_get_docid(FT_INFO *)
MYSQL_FTPARSER_PARAM * ftparser_alloc_param(MI_INFO *info)
Definition: ft_parser.cc:313
uchar ft_simple_get_word(const CHARSET_INFO *, uchar **, const uchar *, FT_WORD *, bool)
Definition: ft_parser.cc:211
void _mi_ft_segiterator_init(MI_INFO *, uint, const uchar *, FT_SEG_ITERATOR *)
Definition: ft_update.cc:41
int ft_boolean_read_next(FT_INFO *, char *)
Definition: ft_boolean_search.cc:744
uint _ft_make_key(MI_INFO *, uint, uchar *, FT_WORD *, my_off_t)
Definition: ft_update.cc:267
float ft_boolean_get_relevance(FT_INFO *)
Definition: ft_boolean_search.cc:951
void ft_boolean_reinit_search(FT_INFO *)
Definition: ft_boolean_search.cc:956
Some definitions for full-text indices.
A better implementation of the UNIX ctype(3) library.
static constexpr uint8_t MY_CHAR_L
Definition: m_ctype.h:542
static constexpr uint8_t MY_CHAR_NMR
Definition: m_ctype.h:543
static constexpr uint8_t MY_CHAR_U
Definition: m_ctype.h:541
ulonglong my_off_t
Definition: my_inttypes.h:72
unsigned char uchar
Definition: my_inttypes.h:52
struct Parser parser
Code for handling of priority queues.
Definition: m_ctype.h:423
Definition: ft_global.h:72
FTS query token.
Definition: fts0tokenize.h:62
uint len
word len
Definition: fts0tokenize.h:64
double weight
word weight, unused in innodb
Definition: fts0tokenize.h:65
uchar * pos
word start pointer
Definition: fts0tokenize.h:63
Definition: my_compare.h:60
The MEM_ROOT is a simple arena, where allocations are carved out of larger blocks.
Definition: my_alloc.h:83
Definition: myisamdef.h:253
Definition: plugin_ftparser.h:133
Definition: plugin_ftparser.h:195
Definition: my_tree.h:68
Definition: ft_global.h:48
Definition: ftdefs.h:133
uint len
Definition: ftdefs.h:134
HA_KEYSEG * seg
Definition: ftdefs.h:135
const uchar * rec
Definition: ftdefs.h:136
uint num
Definition: ftdefs.h:134
const uchar * pos
Definition: ftdefs.h:136
Definition: plugin_ftparser.h:216