# 0
# 0
# mirror of https://github.com/sqlite/sqlite.git synced 2024-11-28 16:09:31 +01:00
# sqlite/ext/fts5/test/fts5trigram.test
# dan 0cd2ffffb7 Fix the fts5 trigram tokenizer so that it handles non-nul-terminated strings.
# FossilOrigin-Name: 84f4e37178a65e3128ac0240d37ac40df08b4050ab070d10707e35d11dcbeb10
# 2024-11-11 19:49:26 +00:00

# 367 lines
# 9.4 KiB
# Plaintext

# 2020 September 30
#
# The author disclaims copyright to this source code. In place of
# a legal notice, here is a blessing:
#
# May you do good and not evil.
# May you find forgiveness for yourself and forgive others.
# May you share freely, never taking more than you give.
#
#*************************************************************************
#
# Tests for the fts5 "trigram" tokenizer.
#
# Pull in the fts5 helper script that sits in the same directory as this file.
source [file join [file dirname [info script]] fts5_common.tcl]

# Stop immediately when the fts5 capability is not available.
ifcapable !fts5 {
  finish_test
  return
}

# Prefix used by the test harness for the tests in this file.
set ::testprefix fts5trigram
# Section 1: trigram tokenizer with default options.
# The table holds one ASCII row and one non-ASCII (Thai script) row.
do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE t1 USING fts5(y, tokenize=trigram);
  INSERT INTO t1 VALUES('abcdefghijklm');
  INSERT INTO t1 VALUES('กรุงเทพมหานคร');
}

# Table columns: test number, query text bound as $s, expected highlight()
# output. Rows 7-9 repeat rows 1-3 with mixed-case query text and expect
# the same highlighted output.
foreach {n s expected} {
  1 abc "(abc)defghijklm"
  2 defgh "abc(defgh)ijklm"
  3 abcdefghijklm "(abcdefghijklm)"
  4 กรุ "(กรุ)งเทพมหานคร"
  5 งเทพมห "กรุ(งเทพมห)านคร"
  6 กรุงเทพมหานคร "(กรุงเทพมหานคร)"
  7 Abc "(abc)defghijklm"
  8 deFgh "abc(defgh)ijklm"
  9 aBcdefGhijKlm "(abcdefghijklm)"
} {
  do_execsql_test 1.1.$n {
    SELECT highlight(t1, 0, '(', ')') FROM t1($s)
  } $expected
}
# A four-character query is expected to parse into a phrase made of the two
# lower-case trigrams "abc" and "bcd".
do_execsql_test 1.2.0 {
  SELECT fts5_expr('ABCD', 'tokenize=trigram')
} {{"abc" + "bcd"}}

# LIKE with an unbound parameter plus an ESCAPE clause. No expected-result
# argument is passed to the harness for this test.
do_execsql_test 1.2.1 {
  SELECT * FROM t1 WHERE y LIKE ? ESCAPE 'a'
}

# Table columns: test number, LIKE pattern bound as $like, expected rowids.
# Rowid 1 is the ASCII row and rowid 2 the Thai row inserted by test 1.0.
foreach {n like expected} {
  1 {%cDef%} 1
  2 {cDef%} {}
  3 {%f%} 1
  4 {%f_h%} 1
  5 {%f_g%} {}
  6 {abc%klm} 1
  7 {ABCDEFG%} 1
  8 {%รุงเ%} 2
  9 {%งเ%} 2
  10 {%"งเ"%} {}
} {
  do_execsql_test 1.3.$n {
    SELECT rowid FROM t1 WHERE y LIKE $like
  } $expected
}
#-------------------------------------------------------------------------
# Section 2: the same two rows as section 1, but the tokenizer is created
# with "case_sensitive 1".
reset_db
do_execsql_test 2.0 {
  CREATE VIRTUAL TABLE t1 USING fts5(y, tokenize="trigram case_sensitive 1");
  INSERT INTO t1 VALUES('abcdefghijklm');
  INSERT INTO t1 VALUES('กรุงเทพมหานคร');
}

# Giving the case_sensitive option without a value must fail.
do_catchsql_test 2.0.1 {
  CREATE VIRTUAL TABLE t2 USING fts5(z, tokenize='trigram case_sensitive');
} {1 {error in tokenizer constructor}}

# Table columns: test number, query text bound as $s, expected highlight()
# output. Unlike section 1, the mixed-case queries in rows 7-9 expect an
# empty result.
foreach {n s expected} {
  1 abc "(abc)defghijklm"
  2 defgh "abc(defgh)ijklm"
  3 abcdefghijklm "(abcdefghijklm)"
  4 กรุ "(กรุ)งเทพมหานคร"
  5 งเทพมห "กรุ(งเทพมห)านคร"
  6 กรุงเทพมหานคร "(กรุงเทพมหานคร)"
  7 Abc ""
  8 deFgh ""
  9 aBcdefGhijKlm ""
} {
  do_execsql_test 2.1.$n {
    SELECT highlight(t1, 0, '(', ')') FROM t1($s)
  } $expected
}
# LIKE against the case-sensitive table. Table columns: test number, pattern
# bound as $like, expected rowids.
foreach {n like expected} {
  1 {%cDef%} 1
  2 {cDef%} {}
  3 {%f%} 1
  4 {%f_h%} 1
  5 {%f_g%} {}
  6 {abc%klm} 1
  7 {ABCDEFG%} 1
  8 {%รุงเ%} 2
} {
  do_execsql_test 2.2.$n {
    SELECT rowid FROM t1 WHERE y LIKE $like
  } $expected
}

# GLOB against the case-sensitive table. The pattern is still bound through
# the variable named like. Rows 9-12 exercise bracket character classes,
# including a literal close-bracket as the first class member and negated
# classes.
foreach {n like expected} {
  1 {*cdef*} 1
  2 {cdef*} {}
  3 {*f*} 1
  4 {*f?h*} 1
  5 {*f?g*} {}
  6 {abc*klm} 1
  7 {abcdefg*} 1
  8 {*รุงเ*} 2
  9 {abc[d]efg*} 1
  10 {abc[]d]efg*} 1
  11 {abc[^]d]efg*} {}
  12 {abc[^]XYZ]efg*} 1
} {
  do_execsql_test 2.3.$n {
    SELECT rowid FROM t1 WHERE y GLOB $like
  } $expected
}

# LIKE with a NULL pattern. No expected-result argument is passed here.
do_execsql_test 2.3.null.1 {
  SELECT rowid FROM t1 WHERE y LIKE NULL
}
#-------------------------------------------------------------------------
# Section 3: validation of the value given to case_sensitive.
reset_db

# The values 2 and 11 are both rejected by the tokenizer constructor.
do_catchsql_test 3.1 {
  CREATE VIRTUAL TABLE ttt USING fts5(c, tokenize="trigram case_sensitive 2");
} {1 {error in tokenizer constructor}}

do_catchsql_test 3.2 {
  CREATE VIRTUAL TABLE ttt USING fts5(c, tokenize="trigram case_sensitive 11");
} {1 {error in tokenizer constructor}}

# Here the entire tokenize=... option is written as one double-quoted
# argument, and table creation is expected to succeed.
do_catchsql_test 3.3 {
  CREATE VIRTUAL TABLE ttt USING fts5(c, "tokenize=trigram case_sensitive 1");
} {0 {}}
#-------------------------------------------------------------------------
# Section 4: insert a three-byte blob whose first byte is 0x00 into a
# trigram table, then run the fts5 integrity-check command on it.
reset_db
do_execsql_test 4.0 {
  CREATE VIRTUAL TABLE t0 USING fts5(b, tokenize = "trigram");
}

do_execsql_test 4.1 {
  INSERT INTO t0 VALUES (x'000b01');
}

do_execsql_test 4.2 {
  INSERT INTO t0(t0) VALUES('integrity-check');
}
#-------------------------------------------------------------------------
# Section 5: the LIKE cases again, for case_sensitive 0 and 1.
# foreach_detail_mode is defined outside this file; presumably it runs the
# script once per fts5 detail mode with the %DETAIL% placeholder replaced
# (TODO confirm against fts5_common.tcl).
reset_db
foreach_detail_mode $::testprefix {
  foreach flag {0 1} {
    # Start from an empty database for each case_sensitive setting.
    reset_db

    # Double quotes are used so that Tcl substitutes the flag value into
    # the SQL text before it is executed.
    do_execsql_test 5.cs=$flag.0.1 "
      CREATE VIRTUAL TABLE t1 USING fts5(
        y, tokenize=\"trigram case_sensitive $flag\", detail=%DETAIL%
      );
    "

    do_execsql_test 5.cs=$flag.0.2 {
      INSERT INTO t1 VALUES('abcdefghijklm');
      INSERT INTO t1 VALUES('กรุงเทพมหานคร');
    }

    # Table columns: test number, pattern bound as $like, expected rowids.
    # The same expectations are used for both flag values.
    foreach {n like expected} {
      1 {%cDef%} 1
      2 {cDef%} {}
      3 {%f%} 1
      4 {%f_h%} 1
      5 {%f_g%} {}
      6 {abc%klm} 1
      7 {ABCDEFG%} 1
      8 {%รุงเ%} 2
    } {
      do_execsql_test 5.cs=$flag.1.$n {
        SELECT rowid FROM t1 WHERE y LIKE $like
      } $expected
    }
  }
}
# Section 6: query plans. ci0 uses the default tokenizer and ci1 uses
# case_sensitive 1.
do_execsql_test 6.0 {
  CREATE VIRTUAL TABLE ci0 USING fts5(x, tokenize="trigram");
  CREATE VIRTUAL TABLE ci1 USING fts5(x, tokenize="trigram case_sensitive 1");
}

# With the case-insensitive tokenizer both LIKE and GLOB show up in the
# plan as index strings L0 and G0 respectively.
do_eqp_test 6.1 {
  SELECT * FROM ci0 WHERE x LIKE ?
} {VIRTUAL TABLE INDEX 0:L0}

do_eqp_test 6.2 {
  SELECT * FROM ci0 WHERE x GLOB ?
} {VIRTUAL TABLE INDEX 0:G0}

# With the case-sensitive tokenizer only GLOB does. LIKE is expected to
# produce a plain scan with an empty index string.
do_eqp_test 6.3 {
  SELECT * FROM ci1 WHERE x LIKE ?
} {{SCAN ci1 VIRTUAL TABLE INDEX 0:}}

do_eqp_test 6.4 {
  SELECT * FROM ci1 WHERE x GLOB ?
} {VIRTUAL TABLE INDEX 0:G0}

# A less-than comparison is expected to give a plain scan on both tables.
do_eqp_test 6.5 {
  SELECT * FROM ci1 WHERE x < ?
} {{SCAN ci1 VIRTUAL TABLE INDEX 0:}}

do_eqp_test 6.6 {
  SELECT * FROM ci0 WHERE x < ?
} {{SCAN ci0 VIRTUAL TABLE INDEX 0:}}
# Section 7: GLOB with a two-character Cyrillic pattern over a table of
# file names that mixes ASCII, Cyrillic and an emoji.
reset_db
do_execsql_test 7.0 {
  CREATE VIRTUAL TABLE f USING FTS5(filename, tokenize="trigram");
  INSERT INTO f (rowid, filename) VALUES
  (10, "giraffe.png"),
  (20, "жираф.png"),
  (30, "cat.png"),
  (40, "кот.png"),
  (50, "misic-🎵-.mp3");
}

# 7.1 applies a unary plus to the column and 7.2 does not; both must return
# rowid 20. NOTE(review): the unary plus presumably keeps the GLOB
# constraint away from the fts5 index - confirm.
do_execsql_test 7.1 {
  SELECT rowid FROM f WHERE +filename GLOB '*ир*';
} {20}

do_execsql_test 7.2 {
  SELECT rowid FROM f WHERE filename GLOB '*ир*';
} {20}
#-------------------------------------------------------------------------
# Section 8: highlight() output when a query has several terms whose
# matches are separate, adjacent or overlapping.
reset_db
do_execsql_test 8.0 {
  CREATE VIRTUAL TABLE t1 USING fts5(y, tokenize=trigram);
  INSERT INTO t1 VALUES('abcdefghijklm');
}

# Table columns: test number, query text bound as $match, expected
# highlight() output. In the expected values, adjacent and overlapping
# matches appear as a single parenthesised run.
foreach {n match expected} {
  1 "abc ghi" "(abc)def(ghi)jklm"
  2 "def ghi" "abc(defghi)jklm"
  3 "efg ghi" "abcd(efghi)jklm"
  4 "efghi" "abcd(efghi)jklm"
  5 "abcd jklm" "(abcd)efghi(jklm)"
  6 "ijkl jklm" "abcdefgh(ijklm)"
  7 "ijk ijkl hijk" "abcdefg(hijkl)m"
} {
  do_execsql_test 8.1.$n {
    SELECT highlight(t1, 0, '(', ')') FROM t1($match)
  } $expected
}

# Second table: the terms abc and cde occur apart, separated by one space,
# and sharing the letter c.
do_execsql_test 8.2 {
  CREATE VIRTUAL TABLE ft2 USING fts5(a, tokenize="trigram");
  INSERT INTO ft2 VALUES('abc x cde');
  INSERT INTO ft2 VALUES('abc cde');
  INSERT INTO ft2 VALUES('abcde');
}

do_execsql_test 8.3 {
  SELECT highlight(ft2, 0, '[', ']') FROM ft2 WHERE ft2 MATCH 'abc AND cde';
} {
  {[abc] x [cde]}
  {[abc] [cde]}
  {[abcde]}
}
#-------------------------------------------------------------------------
# Section 9: LIKE constraints on the last of thirteen columns (a12). Only
# that column is populated.
reset_db
do_execsql_test 9.0 {
  CREATE VIRTUAL TABLE t1 USING fts5(
    a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
    tokenize=trigram
  );
  INSERT INTO t1(rowid, a12) VALUES(111, 'thats a tricky case though');
  INSERT INTO t1(rowid, a12) VALUES(222, 'the query planner cannot do');
}

# One LIKE term.
do_execsql_test 9.1 {
  SELECT rowid FROM t1 WHERE a12 LIKE '%tricky%'
} {111}

# Two LIKE terms on the same column joined with AND.
do_execsql_test 9.2 {
  SELECT rowid FROM t1 WHERE a12 LIKE '%tricky%' AND a12 LIKE '%case%'
} {111}

# A NULL pattern returns no rows.
do_execsql_test 9.3 {
  SELECT rowid FROM t1 WHERE a12 LIKE NULL
} {}
#-------------------------------------------------------------------------
# Section 10: insert text with unusual code points and byte sequences.
# Each test only expects its script to finish with an empty result, so
# these presumably check that the inserts complete without error.
reset_db
do_execsql_test 10.0 {
  CREATE VIRTUAL TABLE t1 USING fts5(a, tokenize=trigram);
}

# Strings written with \U escapes of increasing length, placed both after
# and before the plain text abc. Each value is bound as $val.
do_test 10.1 {
  foreach val {
    "abc \UFFjkl\UFF"
    "abc \UFFFjkl\UFFF"
    "abc \UFFFFjkl\UFFFF"
    "abc \UFFFFFjkl\UFFFFF"
    "\UFFjkl\UFF abc"
    "\UFFFjkl\UFFF abc"
    "\UFFFFjkl\UFFFF abc"
    "\UFFFFFjkl\UFFFFF abc"
    "\U10001jkl\U10001 abc"
  } {
    execsql { INSERT INTO t1 VALUES( $val ) }
  }
} {}

# Hex blob literals. The SQL is double-quoted so that Tcl pastes each
# literal into the statement text instead of binding it as a value.
do_test 10.2 {
  foreach blob_literal {
    X'E18000626320646566'
    X'61EDA0806320646566'
    X'61EDA0806320646566'
    X'61EFBFBE6320646566'
    X'76686920E18000626320646566'
    X'7668692061EDA0806320646566'
    X'7668692061EDA0806320646566'
    X'7668692061EFBFBE6320646566'
  } {
    execsql " INSERT INTO t1 VALUES( $blob_literal ) "
  }
} {}

# Byte strings made of 0x61, then a 0xF7 byte followed by three, four, five
# or six 0xBF bytes, then 0x62. Each one is inserted on its own and again
# with the prefix abcd.
do_test 10.3 {
  set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
  set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
  set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
    INSERT INTO t1 VALUES('abcd' || $a);
    INSERT INTO t1 VALUES('abcd' || $b);
    INSERT INTO t1 VALUES('abcd' || $c);
    INSERT INTO t1 VALUES('abcd' || $d);
  }
} {}
# Section 11: insert the value returned by str('') into a trigram table.
# NOTE(review): sqlite3_fts5_register_str is defined outside this file;
# presumably it registers the SQL function str() used below - confirm.
do_execsql_test 11.0 {
  CREATE VIRTUAL TABLE t4 USING fts5(y, tokenize=trigram);
}

sqlite3_fts5_register_str db

do_execsql_test 11.1 {
  INSERT INTO t4 VALUES( str('') );
}

# Section 12: call the tokenizer directly. For the input abcd the expected
# list is two tokens, each followed by two integers (presumably start and
# end offsets - confirm).
do_test 12.0 {
  sqlite3_fts5_tokenize db trigram "abcd"
} {abc 0 3 bcd 1 4}

# Inputs shorter than three characters, including the empty string, are
# expected to produce no tokens.
do_test 12.1 {
  sqlite3_fts5_tokenize db trigram "a"
} {}

do_test 12.2 {
  sqlite3_fts5_tokenize db trigram ""
} {}

finish_test