Combining Text Mining and Sequence Analysis to Discover Protein

Combining Text Mining and Sequence Analysis to Discover Protein Functional Regions
E. Eskin and E. Agichtein
Pacific Symposium on Biocomputing 9:288-299(2004)
Æ ! ! " # # ! "
! # "
$
% & "
' "
! $
$
$
$
( Æ ) $
(
# *+, (
-+, (
' $ (
" '
.
" (
/ ' $ " # "
$
$ # $ & $ $
$
! " " 0 (
0 .
(
(
1 $
$ " (
.
2332 ! 4 (
0 & $ " $ Initial (seed)
labeled sequence
text annotations
text
Seq.
Extended
training set
Train Text
Classifier
text
Seq.
Predict class of
unlabeled text
annotations
Predict class of
unlabeled sequences
and annotations
text
Step 1: Extend Training Set
by Exploiting Text Annotations
Train Joint
Classifier
Seq.
text
Seq.
Step 2: Exploit both Text and Sequence
information in the Extended Training Set
$% & '
( ) *
+
! .
2332 (
)
(
" 5
6.7 !
! $
) * )
$
$
1 $ " ! $ " 1 $ & $ " % & " $
! $
& $ " $
$
7 "
'
" ' $ "
8
$
% *99: !
! Æ $ ! $ $ $
) $
0' 5#! $
1
Æ (
$
!
.
! $
7 $
$ " $ 0 $ $
7
" '55; $ "
'55; $ $
"
$
!
"#
;
$ 0 " $
$ ' $ & % $ $
7 $
Æ 1 ! $ & $ "
" #$
! $ ! " ;
* < 3 7 $ $ = ¾ 7 $ 0 Æ ) $ $ = % #$
"$ ! 23 ) * 3 ! = ¾ ! 7 Æ ' $
' 4 $
$
! /
8 ;
! " = *
3 * ' ( * $
( 6
&'( " ) %!
0 " $ ' $ " !
$ 0 " = > >
> ! " 4 $ ! ' $ " 4
$ ** ) !
" ? $
! "
$
" + =
* " Ê * 0 "
, , Ê Ê "
=
, " > '
2
" " " - " -
"
! " 4 $( 7 (
" " ) $
" " $ % &
'
'
# " $
! " =
= -
,
@
= = , =
! " % $ , ' Æ 0 ) , ! " $
0 " ' $
. !$! / )$0)
0 ) $ $
& $
" 23, (
! 0 (
0
& $ " $ " )
100
keywords
text fields
text-all
95
90
Precision
85
80
75
70
65
20
30
40
50
Recall
60
70
,% ' )
# # * -**. /0 . (
0 "
$
(
$
0 0' 5#!@3 $
' " (
7 0' 5#! $ (
0 $ (
!
2
7 *33333 0' 5#! *+, $ (
(
)* +
!
0 ,
$ $
1 (
$ % ( $
(
(
$ ! (
.
' ' '
80
'55; " (
0' 5#! $
! " *@@+@ $ ! $
(
0' 5#! ! .
2332 ,
!$ "
'55; ! $
" 0 .
$ " $ "
$
! $
$
) 2 0 " 4 .
"
7 ) 2 $
"
(
) $
:3, $ '55; "
! $
" $
0' 5#! ! 0' 5#! ) 2
0 "
! "
A
B $ . $ " $ C2333 $
"
! $ & $ "
!
"#
# $ -2 $ " $ $% ( ! # ' ( 0 $ 23, 0 D $ " " & $ " ) " 4 ! $ " $ " " ) $
$
) (
" !
2
0 #8 4 ! #8 E E " +3 7 * 3 +3 $ % (
'
'
)*
0 (
2@ (
(
( .F 0
$
23 .F : !
2 .F #
&&&&&&&
#&#&&
&&+&&+&&,&&+&&+&&+&&,&&+&&+&&+&&
.../../././...
0#1/0#...)2.& ##..#.. #..#
"/#
+2!&20&##45,
!"../&#3!,!" !"
"
&&&&&
&&
#&#&&
&&+&&&"&&
#..#.#.#.
...& ###..#
6!7&&##
/6!27&#3
!"
$
#% '
()*
(-*
'
(3*
'
'
,% ' # 12* % 3)4 5 + 5
6 7 5 ($889:$:8;;09 354
< 5
( 7 +$888,9=098 3(4 ( 5 7 ( 5 $88>$>$//8$/9>
1 !!!
0 $
$
$
Æ $
$ $
! "
$
0 & $ " $ & " 0 (
!
"
0 (
0 $ $
(
0 * ; '
8 &
(
"
(
-
. **@G*23 777' 5
*99+
2 7 H % ' H &
6 D 7
-
. 2-+D*+3*G*+-* *99@
- H H
8 %
@
+
C
I
:
9
*3
**
*2
*-
*@
*+
*C
*I
*:
. *@*3D:@CG:+C
*99:
% 8 8 &
*9*D++GI2
*99@
8 F ; ; 0 . ! D 7 "
' &
&# ,
$ . $ /&.0 H
< 2332
8 F ; ; 0 . "
' &
+
1
( &
2 /1(&0 2332
8 F ; ; 7 8 0 . "
!
8
1
233-
.
% ' (
$
. *: *D I:G :C J 2332
.
% (
&
***2D2:-CG@I 6 2332
7 % ! 8 ' &
334 "
" $
)
*99:
H !
7
H 7
! $
$
-
)
'
@DCIG:* 233-
7 %
7 ! 0' 5#! D -
-
I+D-*2G-*C *99I
0 0 8 )
4 ' (
"
-
)
*99+
; ; 7 !
1 233-
; ; 0 . K 7 " !
1 233-
. )
*99:
? . F 1 #8 " $
"
23*D2+G-- *99C
.
5 8
% .F D (
1
+ '
-**D-9IG9 J
233-