Æ !" # $
"
# $
% "& &' ( ()*+&"++&
! " # $ % ! &$ '$ ( ! )'* &$ % ! ! + '$ , & -. ! )'* &/ '0 &1& , " " " " ' " "
"
" -./0 " "1
2 3""-40+
2 "1
" 3 " ," "5 3"#++ 2$-60+ 3 , "
" 3 " Æ , " , " " " -./0+
.
, " ""5 ' " 7 #" $ 7 & "-/0+ "" '
" + ' " " 3 " " 3
" " 5, " " + " , "
'"% " '' 3 , "1
"
'' + + ." 1
3 " , " ' '"+
, " ' ( "% -./ 8 90+ 5" '" " " "" " " ,
" '' + " , " ' " " " '
&' 3
" 1
2 3"" ' ' " ,"+ " '" " " ' "
"#++ " "" 3 :! "'$ " 1
"
+ 3" ""3
3 ." 1
+ " , " 1
" ' " "
"
, +
2"" , " " '3&"% "* " 3 1
" 7 " " 7 " 3 3-./ 80+ " '3&" " , " " ." 1
+ " "" 3 " ,
"* , " + 1
" " " 3 "" "" 1
"" ""+
3 3 ;" "* ,+ ""
, " ' -< = /0+ " " ' " , "
""" " "" , """ 1
""+ '
1
" "" "" ""+ : ' " 3 " 3 " 3 ,+ " " 5" , '" " , "# " $ ' '-8 ..0+ " "
' Æ
3 " ' ' " ,"+
<
" ' " #" $ "5 " "* " 1
' " " ,+ 5" "* , " " + > "" 5 ;"" ' ' "+ " , " " ,"" ,+ & " "
, ' ""+ ? '" "
' "
" " "
3"1
" #.$ "" "
3"1
' " #<$ ""
" ' "
3"1
+ > ' , "
3"1
" " " " +
@< , " , "+ ?" "* @< , "
"3 ' 3" "*+ "* @< , , "" " 3" "* " + " , "* 3" & 3"+ 1
@< , " " "3 '
3" "*+ 1
@< , , "" " 3" "* " & 1
& 3"+ 1
"" "" " " 1
" " , ' "" + " ""
" "3 " 6+
" " " * " '"+ < "3" ," '& ,+ 9 "" " + = "" "
" @< ,+ 6 "" @< , *" "* 1
+ A "" "
" 3 " :""
" :!B -.=0+ 4 "
*" " +
9
" " ' , ,-./0 , " "+ " " 1
& " " +
> " 5 "1
" "
3"1
+
, """ ' ( "% " " ""-/0+ ' " "5 " " "" 3
"
+ """ 5 " " ;"" ' " + ? " " " "
"" 0 ' " 5 0 " " ;"" " 1
-=0+ C"" " " "
"
" ;"" ' " " + B"" ,
"
" BD " " 1
& " "+ ?
. "'" "
,+
posting lists of terms
B+-Tree
on terms
…
a posting
d, [o1, …, of]
d: document identifier
oi: offset where term t occurs in document d
f: frequency of occurrence of term t in document d
?
.+ "
,+
, " ""5 ' " , "% #.$
'3" , " ' " #<$ , " "
-/ .. 90+ " , " + " " " " *
" '"+ <+. "3" "
1
"" ,+ <+<
"3" "+
=
, "" " " , "+ " " " "1
" -.E0+ F, " + " "
3"1
3 3 " ." 1
" " '' , " " " ''+
"1
+
?
< "'" , " "" 9 , ' " " "" G"H GH+ 9" G"H GH GH GH
, G"H ' 9" GH GH , GH+ ." 1
;" ;"" 9" , "
" 3 .+
3-grams
posting lists of 3-grams
str
7, [6, 51]
44, [12]
97, [4, 87]
tri
7, [7, 52]
44, [13]
97, [5, 88]
rin
7, [8, 53]
44, [14]
97, [6, 89]
ing
7, [9, 54]
44, [15]
97, [7, 90]
dat
7, [58]
12, [4, 27]
44, [83]
ata
7, [59]
12, [5, 28]
44, [84]
?
<+ , " "" 9 ,+
I
"" " ' ""% #.$ " 1
" " " " "" " "J #<$ ( 3' " " ""
" 5 " ( 3
-./0+
? , "
" ' ,
1
G"H 3 " , ?
<+ 5" " 1
G"H " " 9" G"H GH GH GH "
"" " 9" "+ " " ( " " "" "
5 " ' 9" G"H GH GH GH 6
"
7 "
G"H+ 5" 4 == 84 " 1
"
" " " 9" "
+
" , "
" " " , " " "1
3"-.E0+ , -./0 " "1
3"-40 +
" "" " 1
" , 3"+ , " ' " +
"
, '" " " , "+ >' " " " " " "
" "
" ' , '" " "-.60+ ' '3" , , " 3 " 3
" , "* -80+
"1
3" " "" "1
" " 1
" 2 3"+ " " " "1
" " ,"+ " " 3
+
,3" " 3 " + ?F-40
' "" , " '&' + ?F "" 9" "1
" 8
" 2 "1
" " , "+ ' " ?F "'" 3
" 3
"'" 3 " " " " "+ >'
" 3 " 3 "* , ?F " "5 3"-A0+
A
" " ' , % ' " ,"" , ' ' + ?
9
"" , " " , @<
,+
?
9#$ "'" , " "+ "1
" G H
3 "
3"1
;"" " . < + "1
" G H 3 "
3"1
B ;"" " . + "
<" G H G H G H " . < G H G H G H " . +
?
9#3$ "'" , " ?
9#$+ ?
9#3$
"" " " + " "" <
" G H G H G H ; 3' ;" G H G H# G H$ " .# <$ " " . 3
" " < + " " " "" <" G H G H G H+ " "
3"1
" " " ;""#' "
3"1
"$ " , "
3"1
'
" 3 , "+
;"" " , "
3"1
, , "*
'
3 " "
" + ?
9#$ "'" @< , " ?
9#$+ ;"" <" G H G H G H 3
" ' " " '"+ 5" 3" ;"" "
3"1
'
" " "' ?
9#$J " 3" ;"" <"
4
G
H G H G H ' "
3"1
" "' ?
9#$+
2-grams
document 1
o1
o2
... a1 a2 a3 a4 ... b1 b2 b3 b4...
A
document 2
a1a2
a2a3
a3a4
b1b2
b2b3
b3b4
...
B
o3
a1 a2 a3 a4 ...
1, [o1+0]
1, [o1+1]
1, [o1+2]
1, [o2+0]
1, [o2+1]
1, [o2+2]
2, [o3+0] N, [o5+0]
2, [o3+1] N, [o5+1]
2, [o3+2] N, [o5+2]
N, [o4+0]
N, [o4+1]
N, [o4+2]
(b) Redundancy of the position information
in the n-gram index.
...
A
o4
o5
document N ... b1 b2 b3 b4 ... a1 a2 a3 a4 ...
posting lists
B
2-grams
A
a1a2
a2a3
a3a4
b1b2
b2b3
b3b4
(a) Document collection.
posting lists
A, [0]
A, [1]
A, [2]
B, [0]
B, [1]
B, [2]
subsequences
A
B
posting lists
1, [o1] 2, [o3] N, [o5]
1, [o2] N, [o4]
(c) Elimination of the redundancy
in the n-gram/2L index.
?
9+ , " ,+
"
@< , " *
"" " " 3 # K$+ " " <+
!
?
= "'" "
@< , ' """ 3& , ,+ "" 3"
;"" "
3"1
" ' " ;"" " ' "
3"1
"+
@< , " 3
' ""% #.$ , "
3"1
" #<$ 3
3& , #9$ , " #=$ 3
,+ ,
/
posting lists
of n-grams
posting lists of
subsequences
B+-Tree on
subsequences
B+-Tree on
n-grams
…
a posting:
a posting: d, [o1, …, of(d,s)]
v, [o1, …, of(v,t)]
v:
subsequence identifier
o i:
offset where n-gram t
occurs in subsequence v
f(v,t):
…
frequency of occurrence
of n-gram t in subsequence v
d:
document identifier
o i:
offset where subsequence s
occurs in document d
f(d,s):
(a) The front-end index.
frequency of occurrence
of subsequence s in document d
(b) The back-end index.
?
=+ "
@< ,+
"
3"1
" "
3"1
" " 5, 3 "
"
3"1
" '
3 .+ " " " "" " ++ '
, "" " " ""+ "" " .+ > ' # .$ "
3"1
" +
" "
3"1
+
"
3"1
" , "
"
" ' 3
. " "" +
C% , +
?
6 "'" 3
@< ,+ " + . ," "
3"1
" " " "
' 3 .+ " " "1
" +
," "
3"1
" " ' E + " "
3"1
" "" " 3& "
"
3"1
+ < 3
" 3& ,
" "
3"1
" 3 .+ ? "
3"1
" ;""
" 0 " " " + 9 ," " " "
3"1
" 3 . 3 " ."
8
1
+ = 3
" , " " 3 9+ ?
" "
3"1
;""
" 0
" " " +
Algorithm n-Gram/2L Index Building:
Input: (1) The document collection D, (2) The length m of subsequences, (3) The length n of n-grams
Output: The two-level n-gram inverted index
Algorithm:
Step 1. Extraction of m-subsequences: for each document in D
1.1 Suppose that a document d is a sequence of characters c_0, c_1, ..., c_{N-1};
extract m-subsequences starting from the character c_{i*(m-n+1)} (0 ≤ i < ⌊(N-n+1)/(m-n+1)⌋) and
record the offsets of the m-subsequences within d.
1.2 If the length of the last m-subsequence is less than m,
pad blank characters to the m-subsequence.
Step 2. Construction of the back-end inverted index: for each m-subsequence obtained in Step 1
2.1 Suppose that an m-subsequence s occurs in a document d at offsets o0,o1,...,of ;
append a posting <d, [o0,o1,...,of ]> to the posting list of s.
Step 3. Extraction of n-grams: for each m-subsequence obtained in Step 1
3.1 Suppose that an m-subsequence s is a sequence of characters c0,c1,...,cL-1 ;
extract n-grams starting at the character ci (0 ≤ i < L-n+1) and
record the offsets of the n-grams within s.
Step 4. Construction of the front-end inverted index: for each n-gram obtained in Step 3
4.1 Suppose that an n-gram g occurs in an m-subsequence v at offsets o0,o1,...,of ;
append a posting <v, [o0,o1,...,of ]> to the posting list of g.
?
6+ 3
@< ,+
?
A "'" , 3
@< ,+ " L < L =+ ?
A#$ "'" " "+ ?
A#3$ "'" " ="
3"1
"
, "+ ="
3"1
" , "
3 .#++
.$ " , E GBH GBH GBBH+ ?
A#$ "'"
3& , 3
" ="
3"1
"+ ="
3"1
GBH " ;"" E 9 A " E 9 = " "" E -E0
9 -90 = -A0 " " ="
3"1
GBH+ ?
A#$ "'" " ="
3"1
" 5"+ ?
A#$ "'" " <" , .E
="
3"1
" ?
A#$+ <" , 3 ." 1
" ,
="
3"1
E GBH GBH GH+ ?
A#$ "'" , 3
" <"+ < GBH " ;"" E < . < ="
3"1
" E 9
= 6 " "" E -E0 9 -<0 = -.0 6 -<0 " " < GBH+
4-subsequence window
4-subsequences
ABCDDABBCD
ABCD
document 1
DABCDABCDA
BBCD
document 2
CDABBCDDAB
BCDA
document 3
BCDABCDABC
CDAB
document 4
DDABCDABCD
DABC
document 5
BBCDABCDAB
DDAB
document 0
(a) The document collection.
(b) The set of
4-subsequences.
ABCD
BBCD
BCDA
CDAB
DABC
DDAB
subsequence 0
AB
subsequence 1
BBCD
BB
subsequence 2
BCDA
BC
subsequence 3
CDAB
CD
subsequence 4
DABC
DA
subsequence 5
DDAB
DD
(d) The set of 4-subsequences.
(e) The set of 2-grams.
0, [0]
0, [6]
1, [6]
1, [3]
1, [0]
0, [3]
3, [3]
2, [3]
3, [0]
2, [0]
3, [6]
2, [6]
4, [6]
5, [0]
4, [3]
5, [6]
5, [3]
4, [0]
(c) The back-end index.
2-grams
ABCD
posting lists of
4-subsequences
AB
BB
BC
CD
DA
DD
posting lists of 2-grams
0, [0]
1, [0]
0, [1]
0, [2]
2, [2]
5, [0]
3, [2]
4, [1]
5, [2]
1, [1]
1, [2]
3, [1]
2, [0]
2, [1]
4, [0]
4, [2]
3, [0]
5, [1]
(f) The front-end index.
?
A+ , 3
@< ,+
" " ' " "" 1
" " @< ,+ ? "
," ' ' , 1
" "" " "+ :
3 , , 1
" 3 1
" "" ""+
1
"" """ ' ' ""% #.$ " , "
" #<$ " 3& , 5 ..
"
"+ 5" " ' " "
3"1
" 1
" 3 " , ' " , 1
"+ "
3"1
" ' 1
" 5 " "+ " " ' " " " "
3"1
" 1
" 3 " 3& , ' "
3"1
" 5" "+ " "
3"1
" 5" " " " "
" "" "" #++ $
1
+ 5 "
" 3 3 " " 3 5 " " ""+
2' ' 5 5 . 5 9+
"
3"1
1
" "" ' "% #.$ "
Æ, " 5, J #<$ ' " " "
3" J
#9$ 5, " "
Æ, J #=$ "
3" " ' " +
," "1
"1
" "1
+ #.$ ? "1
"" ' "1
"% L L + " "
Æ, 5, 3 #++ L ' E+$ # $ L + #<$ ? "1
"" '
"1
"%
# $ L
#
# $ $+
% "
3" 1
" "" '
3 "1
"
3"1
" ' + # $ " ' " +
" "
3"1
"
?
4 "'" ," + > " 1
" "
3"1
+ ?
4#$ " " "
Æ, " 5, + ?
4#3$ " " GBH " "" " 5 .+
.<
A B B C
S
Q
B B C D A
S
A B C D
Q
B B C D A
(b) S does not cover Q.
(a) S covers Q.
?
4+ F," "
3"1
1
+
" " "
3"1
"
" "
3"1
+
1
" "
C% 5" "' "" " "
3"1
" " ?
/+ # $ 3
+ "" "" ' # $ #?
/#$$ # $
#?
" /#3$ #$$+ ?
/#$ " " + ?
/#3$ " " + ?
/#$ " " + ? ?
/ ' " " " " "
3"1
""5" 5 . +
Si
Si+1
...
...
for Len(Q) ≥ m
Sj
...
Q
(a) {Si, Si+1, ... Sj} contains Q.
...
Sk
Sp
Sq
...
...
Q
...
for Len(Q) < m
Q
(b) {Sk} contains Q.
(c) {Sp, Sq} contains Q.
?
/+ "" " "
3"1
" " +
?
8 "'" "" 1
" " @< ,+ "
+ . "" 1
" " "" " "" " " ,+ ( " " "" " "
3"1
5 " ( 3
" "
3"1
" 3 5 .#++ "
3"1
" ""
.9
"" .$ " + "
3"1
" " , " ( .+<+
> "" ;" "" 3 ( & ' "
3"1
" + < " ( " "" "
3"1
" " 5 " ( 3
+
5" " "
3"1
" " 5 "
5 3 & ' " 5 9+ " 3 "
3" " ( <+.+ > ""
;" "" 3 ( & ' "
+ " " " 1
"
+
Algorithm n-Gram/2L Index Searching:
Input: (1) The two-level n-gram inverted index
(2) A query string Q
Output: Identifiers of the documents containing Q
Algorithm:
Step 1. Searching the front-end inverted index:
1.1 Split Q into multiple n-grams and search the posting lists of those n-grams.
1.2 Perform merge outer join among those posting lists using the m-subsequence identifier as the join attribute;
add the m-subsequences that cover Q by Definition 1 into the set Scover.
Step 2. Searching the back-end inverted index:
2.1 Perform merge outer join among the posting lists of m-subsequences in Scover
using the document identifier as the join attribute.
2.1.1 Identify the set {Si} of m-subsequences having the same document identifier di and
perform refinement by checking whether {Si} contains Q or not according to Definition 3.
2.1.2 If {Si} contains Q, di is returned as the query result.
?
8+ "" 1
" " @< ,+
.=
" # $%% & ' " " ' " "" @< ,+ 6+. ' @< , " 3 " ,"" ,+ 6+< ' " " ," " ,"+ 6+9 ' " ," "+
!"
" " ' 3" " ," ,
" " 3 # K$-.4 .0 "' @< ,
3 3 " ?
2
?# !" $+
? "& ' 5" " " , " " 3 ?" 2 ?#.2?$+ " !#$ + "
" 3
" 2 :+ > 2 " " 5" : ;""+ ?
' " 3 3 3
3 " 3
: ' 3
" : :+ " !#$ $ + " " 5 3
" 2 : :+ > " "
3"1
" : ;"" "
' "
3"1
" : ;"" "
3"1
" ' "+
" 3
" : : 2: : 3 " 3
" 2 : 2:+ " " "5 ;" 3" "
3"1
" "'
.+ #
$ 2:
$ " 3 #
"" "
3"1
;" 3"
+ " ;" ' " "
3"1
;" ' "
3"1
" +
.6
?
.E "'" < , 3
" ?
A#$+ ?
..#$
"'" 2: " ,+ ?
..#3$ "'" 2: :
# L =$ 2: ?
..#$+ > " 2: :
" 3 " 3
+ ?
.. & 2: : "
3 & 2:+ < GBH ;" . E 3" ="
3"1
GBH ?
A#$ 3
& 3" GBH+ 3
: 3" . 3
" < GBH
" ="
3"1
GBH ;" . 3
: 3" E 3
"
="
3"1
GBH " E ;" E+
2-grams
posting lists of 2-grams
AB
0, [0, 5]
1, [1, 5]
2, [2, 8]
BB
0, [6]
2, [3]
5, [0]
3, [3, 7]
4, [2, 6]
5, [4, 8]
BC
0, [1, 7]
1, [2, 6]
2, [4]
3, [0, 4, 8]
4, [3, 7]
5, [1, 5]
CD
0, [2, 8]
1, [3, 7]
2, [0, 5]
3, [1, 5]
4, [4, 8]
5, [2, 6]
DA
0, [4]
1, [0, 4, 8]
2, [1, 7]
3, [2, 6]
4, [1, 5]
5, [3, 7]
DD
0, [3]
2, [6]
4, [0]
?
.E+ , ,+
N
D
O
AB
AB
AB
AB
AB
AB
AB
AB
AB
AB
AB
AB
BB
BB
BB
BC
BC
BC
BC
BC
BC
BC
BC
BC
BC
BC
BC
0
0
1
1
2
2
3
3
4
4
5
5
0
2
5
0
0
1
1
2
3
3
3
4
4
5
5
0
5
1
5
2
8
3
7
2
6
4
8
6
3
0
1
7
2
6
4
0
4
8
3
7
1
5
CD
CD
CD
CD
CD
CD
CD
CD
CD
CD
CD
CD
DA
DA
DA
DA
DA
DA
DA
DA
DA
DA
DA
DA
DD
DD
DD
0
0
1
1
2
2
3
3
4
4
5
5
0
1
1
1
2
2
3
3
4
4
5
5
0
2
4
2
8
3
7
0
5
1
5
4
8
2
6
4
0
4
8
1
7
2
6
1
5
3
7
3
6
0
(a) An example of the NDO relation.
S
N
O1
D
O2
ABCD
ABCD
ABCD
ABCD
ABCD
ABCD
ABCD
ABCD
ABCD
BBCD
BBCD
BBCD
BBCD
BBCD
BBCD
BBCD
BBCD
BBCD
BCDA
BCDA
BCDA
BCDA
BCDA
BCDA
BCDA
BCDA
BCDA
AB
AB
AB
BC
BC
BC
CD
CD
CD
BB
BB
BB
BC
BC
BC
CD
CD
CD
BC
BC
BC
CD
CD
CD
DA
DA
DA
0
0
0
1
1
1
2
2
2
0
0
0
1
1
1
2
2
2
0
0
0
1
1
1
2
2
2
0
3
4
0
3
4
0
3
4
0
2
5
0
2
5
0
2
5
1
3
4
1
3
4
1
3
4
0
3
6
0
3
6
0
3
6
6
3
0
6
3
0
6
3
0
6
0
3
6
0
3
6
0
3
CDAB
CDAB
CDAB
CDAB
CDAB
CDAB
CDAB
CDAB
CDAB
DABC
DABC
DABC
DABC
DABC
DABC
DABC
DABC
DABC
DDAB
DDAB
DDAB
DDAB
DDAB
DDAB
DDAB
DDAB
DDAB
CD
CD
CD
DA
DA
DA
AB
AB
AB
DA
DA
DA
AB
AB
AB
BC
BC
BC
DD
DD
DD
DA
DA
DA
AB
AB
AB
0
0
0
1
1
1
2
2
2
0
0
0
1
1
1
2
2
2
0
0
0
1
1
1
2
2
2
1
2
5
1
2
5
1
2
5
1
3
5
1
3
5
1
3
5
0
2
4
0
2
4
0
2
4
3
0
6
3
0
6
3
0
6
0
6
3
0
6
3
0
6
3
3
6
0
3
6
0
3
6
0
(b) An example of the SNDO1O2 relation.
(sorted by attribute S)
?
..+ , "' ," K 2: :+
.A
' K" 2: : #++ ,$ <+
K" 2: : 2: :+ >
" "
&+
C% B 5 K-.< .4 .0 " ' "
3"" ' " ' " ' - 0 L - 0# "
3
" $ " " " "" "
3'+ " ¶ L #++ L $# "" 3
"$-.4 .0+ " K " 3
" # $
" " -.A0+
.+ - 0 L - 0 L - 0 L - 0
<+ - 0 L - 0 9+ - 0 L - 0 0L -
0
0L -
0
3 " "
3"1
" , " " , "
3"1
" " ' "#. $+ " " " , "
" " " " "+ > " " '" " 2: " " " , : " " " '" " + " ! L 2: : M L L 2: !M L : + " 3 ""5 3
" " #!M$
" " " " " M
"+ "
" 3 ""5 " ' L + " 2: ¶ : ¶ 2: L
2: : : L 2: :+ " K" 2: : 2: :+ " 3
" "
& " "' , ?
..#3$+
!
!
! !
!"
.4
K" 2: : 3
" " " '
"
3"1
" " " ' , "
3"1
+ 3
" K"
-.< .A .4 .0+ 2: : 3' "
" " " ," " " ""3 3"
" " "
3"1
+
?
..#3$ "'" , "' ," K" 2: : 2: :+ " " 2: : "'
?
..#3$ ,"" : " #E E$ #9 9$ #= A$ 2: " #GBH E$ #GBH .$ #GH <$+ " 2: " :
" " " '" " GBH+ "
" " "+
2: : " ?
2 ?#=2?$+
C% K 2: ,"" ' " "
&+
3& ," @< , ' " 3
3 " 2: : " " 3 =2?+ " " <+
" 3 " 3 K " ," @<
,-.<0+
" #2: :$ " =2?+
C% , B
=2? " #2: :$ 2: : " 3& ," @< ,+
C% 2: " " " , 3 2 : " "
3"1
5 ;" "+ : " " " ./
3& , 3 : " 5 ;" "+
=2? " #2: : $ 2: : " 3& ," @< ,+
?
.< "'" 2: : ?
.. " " '
" 2: : + 3
: " "" "
3"1
5"+ " : " 3 "
3"1
5+
" " 2: : ?
.. " " " " 2: : ?
.<+ ++ " 3' 2: : ?
.. " 3 ?
.<+ " "
2: : ' " , 3 3& ," ?
A "+
S
0
3
4
5
1
0
1
2
4
0
1
2
3
2
3
4
5
5
N
AB
AB
AB
AB
BB
BC
BC
BC
BC
CD
CD
CD
CD
DA
DA
DA
DA
AB
S
ABCD
ABCD
ABCD
BBCD
BBCD
BBCD
BCDA
BCDA
BCDA
CDAB
CDAB
CDAB
DABC
DABC
DABC
DDAB
DDAB
DDAB
O1
0
2
1
2
0
1
1
0
2
2
2
1
0
2
1
0
1
0
SNO1 relation
AB
BB
BC
CD
DA
DD
0, [0]
1, [0]
0, [1]
0, [2]
2, [2]
5, [0]
D
0
3
4
0
2
5
1
3
4
1
2
5
1
3
5
0
2
4
O2
0
3
6
6
3
0
6
0
3
3
0
6
0
6
3
3
6
0
SDO2 relation
posting lists
2-grams
(0)
(0)
(0)
(1)
(1)
(1)
(2)
(2)
(2)
(3)
(3)
(3)
(4)
(4)
(4)
(5)
(5)
(5)
4-subsequences
3, [2]
4, [1]
5, [2]
1, [1]
1, [2]
3, [1]
2, [0]
2, [1]
4, [0]
4, [2]
3, [0]
5, [1]
ABCD
BBCD
BCDA
CDAB
DABC
DDAB
The front-end index in Figure 6
0, [0]
0, [6]
1, [6]
1, [3]
1, [0]
0, [3]
posting lists
3, [3]
2, [3]
3, [0]
2, [0]
3, [6]
2, [6]
4, [6]
5, [0]
4, [3]
5, [6]
5, [3]
4, [0]
The back-end index in Figure 6
?
.<+ "
" 2: : ?
.. ' "+
.8
"
" ; "* @< , " "
3"1
"+ " 3 "+ : " 3 ' @< ,+ " " ' * "*
@< , " *"
, "*+ 3 +
3 . ' "
* " 3" " 3 " * "* @<
,+ > ' " , "* " 3 ;"" " 3
" " , " " " "-/0+
3 .+ " 3 " * "* @< ,+
3"
5"
"* ,
"* ,
"* 3& ,
Ë
" 1
"
3"1
" , 3 " , "
3"1
# $
1
"
3"1
# $
# $ ' Ë #L # Ë # $$ Ë$
#Ë$
# $ ' Ë #L # Ë # $$ Ë$
#Ë$
#
#
#
$
$
2' ' 5 Æ
% 5 =+
Æ
% " "* , @< ,+ "
Æ
%
L
<E
#
# D
#
#.$
" Æ 5 = " ?
" #.$#6$+ 3 ;"" , " 3 " 2: :+ 3
" 2: : " 1
;"" , " 2: : " 3 * , .2?+ " < 3
2: : " 3
" 2: " 3
"
: " + " 2: : 3 " " 3" # $ # $+ "* , 3 "
?
#<$ ++ "
# $ # $ 1
"
3"1
"+ 3 "*" , 3& , "+ 2: " , 3 " " 3" # $+ > "* , " " ?
#9$ ++ "
# $ 1
"
3"1
"+
: " 3& , 3 " " 3" # $+ > "* 3& , " " ?
#=$ ++ "
# $ 1
"
3"1
"+ "1
' 3 ?
#6$ "
Æ ?
" #.$#=$+
#
L
#
# #
Æ
%
Ë
L
L
#$
#9$
#$
#=$
Ë
Ë# #$ Ë# #$ D
Ë
Ë
#<$
#$
# $$
Ë
L
# $$
# $ $
#Ë$$
#Ë$$
#
#
#Ë$ #Ë$ D
#6$
" Æ " 3 " Ë # $ # $ ' 3 3
3 "" + " 3 3 "1
" #++ #
$$+ ' 5" " Æ"
" #
<.
" " " " ,
"
Æ+ F, "
" "' " , # D.$ # D9$
++ 3 . 9 .E B".B"+
?
#6$ "'" " , , " #Ë# $$
' @< , " #Ë# D $$+ " 3 ?
#6$ " Æ " ,* ' " 1
+ > "" " 3" "* " " " " + " 3" 3" " ' " 3 ,
" Æ+ " 3
3 " 3" "* ""+ #
$ ""
# D $ " " Æ "" " 3" "*
"+ @< , " " , "* 3"+
"
"
" ; 1
@< , # $ 1
" + " " ' 3& "" 1
" " "+
? " "" ' 5" & ' ' ""
"% #.$ 1
""
" 3 ;"" "" 3 " "" ""+ " ; " "" " " " "& "& " "J #<$ "* " " ""3 3" "#LN$ "
3"1
"#LN$ ' N " 3
, , # , ' N L <A L 6 N L .. //. 94A$+ 1
"" " " 3" " ""
" "3+
1
, @< , " 3 " ?
" #A$#8$+ 3 3 ;"" " " 3
<<
3 " "" "" 1
""+ 3 ;"" ""
1
"" " + , " " "
# # $ D .$ 1
"" " " ?
#A$+ , @< , " " " # # $ D .$ 1
"" " "
?
#4$+ 3& , @< , " + > " 3 "
3"1
" " ; ' # $
# $ + # $ 3 "
3"1
" ?
/#$ "
# # $ D .$ "
3"1
" " N + # $ 3 "
3"1
?
/#3$ " # # $ D .$ N "
3"1
" " N + > 1
"" 3&
, " " ?
#/$+ ? ?
#8$ "'" 1
"" "+
L D
L
#
L
# N
N
# $ D. D <
#
L # # $ D .$
#A$
# # $ D .$
#4$
# $ D .$ N
N
D<
N
µ ·½
¦´
' L <
B "
3"
' Ë#
, " #Ë# #
"
# $ L <
# $
# $
# $ #8$
"
<9
#/$
# $
$ ?
#8$ "'" ,
$$ ' @< , " #Ë#
D
´µ·½
¦ ´µ # $
$$+ " ," " ," " ,"+ , " @< , " " 1
" ' , " " 3"
"* " +
? ?
" #A$#8$ ' 1
"" , ""
# $+ " 1
"" @< , "" "+ , 1
"" "" # $ 3
3
" " 1
"" 3
" , "* " "+ "* , " " , 3
" "* "
3"1
" " " "* "# , ' "* " .B" L6 "* " "
3"1
" " .9 <4 B"$+
?
3& , # $ ;" 1
"" " N "
# , ' N L <A L A L 9 N L .4 64A$+ " " " ,
" " 3 1
, 1
"
" 3 3-A0+
* 1
"" " ' "
& "
"" + " " "& 1
" "+ " "" " + > 3 " ?
" #A$#/$ '
" "" " "' ?
" #.E$#.<$+
%
&
L %
%
%
&
L
%
# # $ D .$
#.E$
&
L
%
# # $ D .$
#..$
# $ D. D <
#
# $ D .$ N
?
" #.E$ #.<$ &
<=
N
L
&
D<
N
# $
# $
#.<$
+ " " "" @< , " , 3
+ ?
#.<$ " < N
# $ < N
# $ ' " " " , " " + > ' " # .$
" "
3"1
" ' "5 1
' "5 " , "*+ "1
' " # .$ A+<+
&
( )& #
$ % $&
" ," " "' "* 1
@< ,
"
" ,+ " & 5 ?
#.9$ "
"
, "* 3 """ ' & 1
+
& L
#.9$
," " ' ""+ 5" " " F" ,
3"" O C ?! !F 3"" O " + " "" .E B" .EE B" .B" ' " "" " " 3"
+ " !F.E !F.EE !F. "+ "
" " "1
3"" O + 2B B
'3 " O " 3"+ " "" .E B" .EE B" .B"+
" C!: F2.E C!: F2.EE C!: F2. "+ "
"" " " 3" !F 3"" & " !F C!: F2 " ,
P
"
" ,"+
½
, "* ' "
" , "* " C!: F2
<6
!F 3"" ' + " " Æ #?
#6$$ " =+< " , "* + ' "' " " + , , ' " 3 9 ' " " "
"-.9 40+ B"" ' 3& , ' = #++ D .$ # D .$+
1
' "
3 """ ' &
' 3" "* 1
+ " ; 3" "* '
3
3"" .E B" .EE B" .B" " C!: F2 !F
"+ > ' " .EE " ' " 1
" '" "
9./ " "
+ " ; 1
' 1
" '"% 9 A 8 .< .6 ./+ 1
" 9 ' " " "
L 9+ Q" C!: F2. !F. ' " 6E " ' " 1
"
" "
+
," C
<+A >* , C ' .B" =EEB" FF "&"+ 3
; ; 2QM 5 "" "& @:" ' " ' "&" , 5"+ " , :""
" :!B -.=0 ,"+ "* ," " " 3
=E8A 3"+
#
' $
?
.9 "'" " , "* " " 3" "* "
3"1
"
C!: F2 3""+ " "
" "* @< , "
"5 ' ,+ ?
.9#3$ "'" "* @< , ' "
3"1
" " " 3 " 3 .+4 "
C!: F2.E 3 <+< " C!: F2.EE 3 <+4 " C!: F2.
<A
' ,+
estimated
index size ratio
3.5 optimal length mo
3
2.5
2
1.5
1
0.5
0
4
PROTEIN-100M
real index size ratio
PROTEIN-1G
5
6
3
2.5
PROTEIN-10M
optimal length mo
2
1.5
1
0.5
0
4
5
6
subsequence length m
subsequence length m
(a) The estimated index size ratio
as m is varied.
(b) The real index size ratio
as m is varied.
?
.9+ " , "* C!: F2 3""+
" ?
.9#$ " ?
.9#3$+ ?
C!: F2.E C!: F2.EE C!: F2. 3 " = 6 6
"+ B"" , "* " " " " /AR 8/R " "' " "#<R .=R$ "+ " "
" "" 6+<
" +
?
.= "'" " , "* !F 3""+ " " + >' , "* " "' 3 6<R
4ER " "' " ' ?
.9+ " 3
"
, "" 5" "
3"1
5" ;"" ' ' ;"" "+ 5 # "
3"1
$ ;"" ' " " # ?
=$+
> " 1
5" " @< , 3
" 3 ;"" " " " , " & " + " ; 3 ;"" " ;"" , " @< ,+ " " & ?
.=
?
.9 3
" !F *" " '" ,"""
<4
1
+
TREC-1G
TREC-100M
estimated
index size ratio
real index size ratio
4 optimal length mo
3.5
3
2.5
2
1.5
1
0.5
0
4
5
6
7
2.5
TREC-10M
optimal length mo
2
1.5
1
0.5
0
4
5
6
7
subsequence length m
subsequence length m
(a) The estimated index size ratio
as m is varied.
(b) The real index size ratio
as m is varied.
?
.=+ " , "* !F 3""+
3 < "'" , "* "" " 3" "* "+ > ,
, "* " 3 3 " " "
3"1
"+ # " " ""
' ' " # .$ * 1
$ " "
5" "" 6+<+
3 < " 3" "* " 3 .E B" .B" , "* "" 3 <6R C!: F2 3"" 3 <8R !F 3"" ' L + "" 3 <R C!: F2 3"" 3 <.R !F
3"" ' L .+ > " " 3 " "
L . L 9 C!: F2.E +
3 <+ , "* " 3" "* " +
"
10 MBytes
100 MBytes
1 GBytes
PROTEIN
1.734 (m_o=4)
2.153 (m_o=5)
2.705 (m_o=5)
N/A (m_o-1=3) 1.847 (m_o-1=4) 1.877 (m_o-1=4)
TREC
1.337 (m_o=5)
1.678 (m_o=6)
2.219 (m_o=6)
1.281 (m_o-1=4) 1.677 (m_o-1=5) 1.878 (m_o-1=5)
</
!
"
?
.6#$ "'" 1
"" , @< , " 3"
"* " C!: F2 3"+ > ' " "
3"1
" # .$+ "
3 , 6+9 @< , "5 " 1
' ,+ ?
' 3 " 3"
"* " + ?
.6#$ "'" 1
" .+94 " C!: F2.EE B A+A6 " C!: F2.B+
?
" .6#3$ #$ "' 3 """ 1
"" " 1
" C!: F2.+ " " @< ,
" # $ " ' " ,+ ?
.6#3$ ?
.6#$
" # $ " 9 ./ 3 """ @< , "" 3 <4R ' & 3 69R ' " , "
3 .<+E " 3 9<+8 " "+ ; ' & 7 ' " 1
"
" ", " 7 " 3 .9+. " ' " ,+
3-gram index
Wall Clock Time (ms)
100000
10000
1000
100
10
10M
100M
1G
data size (Byte)
(a) The query processing time
as the database size is varied.
(Len(Q): 3~18)
3-gram/2L index (m=4)
# of page accesses
25000
20000
15000
10000
5000
0
3
6
9
12
15
18
Wall Clock Time (ms)
20000
15000
10000
5000
0
3
query length Len(Q)
(b) The number of page accesses
as Len(Q) is varied.
(data set: PROTEIN-1G)
6
9
12
15
query length Len(Q)
(c) The query processing time
as Len(Q) is varied.
(data set: PROTEIN-1G)
?
.6+ 1
C!: F2 3""+
?
.A "'" 1
!F 3"" "' " C!: F2 3""+
<8
18
3-gram index
Wall Clock Time (ms)
100000
10000
1000
100
10
10M
100M
3-gram/2L index (m=4)
# of page accesses
60000
45000
30000
15000
0
1G
data size (Byte)
(a) The query processing time
as the database size is varied.
(Len(Q): 3~18)
3
6
9
12
15
3-gram/2L index (m=5)
18
Wall Clock Time (ms)
40,000
30,000
20,000
10,000
0
0
3
6
9
12
15
query length Len(Q)
(b) The number of page accesses
as Len(Q) is varied.
(data set: TREC-1G)
(c) The query processing time
as Len(Q) is varied.
(data set: TREC-1G)
?
.A+ 1
!F 3""+
* + % %
" ' " @< , "5 " "* "
1
' ,+ " 5 " ,"" , +
' "
, ' ""% .$ , . "
3"1
" " 3
3& ,J <$ , "
" "
3"1
" 3
,+
* " @< ,+ ?" ' < " ,"" , "
K+ ' 9 < , " 3 * "" "" , =2?+ ' * " , " #++
#Ë# D $$
$ * , "*+ " , , "
, " #Ë# $$ , "* 3"
& " 3" "* " + ' * ,+ "
"
18
query length Len(Q)
9E
, " "' 3 " " " , 1
3" & " 3" "* " + B"" ' '
" 1
"" 3 " "5 , "*#++ 3 " # .$ " "
3"1
"+$ ?
' "' 1
"" "" "
" 1
" 3 " ?
#8$+
," ," "* 1
@<
, " "
3"1
" 3" "* 1
+ "
# .$ " "
3"1
" " 1
""+ F, "
" "
, 3"" .B" "' "* @< , " 3
.+8#C!: F2. L =$ <+4#C!: F2. L 6$ " " 1
7 ' " 1
" " ", " 7 " 3 .9+.#C!: F2. L =$ " ' " ,+
: " "
" @< , " ' "
,+ , , " " 3 Æ , #++
2 "1
"$ ' " + " " +
, " '& '" "
3 F ?
#:F?$ !" # $+ '
& & "
'" 3 " 1
" +
& %
-.0 3 3"* > ?+ + " # % '
'
> = + <EE.+
9.
-<0 " ; " S3 G, 5" " , H (') *
+
% K+ .= 2+ = + 9=8O948 :+ .88A+
-90 F " 2" GC 3 2 B" ! "H ,
# K+ . 2+ 6
+ .O<6 + <EEE+
-=0 ?& >
F+ " " " S3 G"" ,"
?" I
F
H -+ . '
+ / ! ? + <<<O<<8 + <EE<+
-60 * 2 G , H (') '
%
K+ 99 2+ . + 9.O// + <EE.+
-A0 >
F+ " G !H -+ (
#
'
<EE9+
-40 >
F+ " " S3 G, ! 3""H 000 *
+
1
# 0
K+ .= 2+ . + A9O4/ +@?3+ <EE<+
-/0 + + ; + B )
%2 '
#
C
3"" " " < + .888+
-80 " 5 C
2 G 2 H -+ . '
+ / ! + =.6O=.A @
+ <EE9+
-.E0 + G!
" >" ?
" "H (') *
+ % K+ .6 2+ 9 + <8.O9<E .884+
-..0 > GQ" " , !H -+ . '
+
/ ! S
'* + <.AO<<= .88A+
-.<0 ; + Q -
# 1
C"" Q .8//+
9<
% 3+ -.90 & G 1
" " ,H (') '
% K+ <= 2+ = + 944O=98 + .88<+
-.=0 " & > G:""
"%
>C :!B ' ! ?
"H -+ 000 .
'
+ # 0
4'#05 & + <EE6+
-.60 :' "
" "
G:* 1
,H -+
. '
+ / ! 3
" + 9A4O9A/ .88/+
-.A0 !
!&" # )
% '> .88/+
-.40 !* F" & B+ 2 "
# % " "
= + <EE9+
-./0 ! B*" B !32 )
/ C"" .888+
- )& & ' " " " "1
" 3 +
" , " D. 5" " #E D.$+
B"" " " " "
3"1
+ 3 "
3"1
" ,
" 5" "
3"1
" #E $+ ? , # 3 ' " 3 "
3"1
' " + ( ' "
3"1
" , " , " , +
" ' 3 + ' ' ""% #.$
L . #<$
. #9$
.+ 3 & " '
" "" " , +
' &
&
& & $ & L .%
' 3 . L D D .+ " " , J " + "
99
(
" '
3 "" +
$ & .%
, " ""
L < '
"" + ' 3 < L D D<+ " " , J
" + " , " ""+
&
(
$ & .%
" ""
L '
"" + ' 3 L D + " " , J "
+ " , " +
(
&
"
"
3"1
" , "
' 3
" "" +
. .- )& & KT" 2: : 2 2: 2: + " : : : :+ " KT" " =2?+
9=
© Copyright 2026 Paperzz