1 |
/********************************************************************** |
2 |
fingerprint.cpp - Implementation of fingerpring base class and fastsearching |
3 |
|
4 |
Copyright (C) 2005 by Chris Morley |
5 |
|
6 |
This file is part of the Open Babel project. |
7 |
For more information, see <http://openbabel.sourceforge.net/> |
8 |
|
9 |
This program is free software; you can redistribute it and/or modify |
10 |
it under the terms of the GNU General Public License as published by |
11 |
the Free Software Foundation version 2 of the License. |
12 |
|
13 |
This program is distributed in the hope that it will be useful, |
14 |
but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 |
GNU General Public License for more details. |
17 |
***********************************************************************/ |
18 |
|
19 |
#include "config.h" |
20 |
#include <vector> |
21 |
#include <algorithm> |
22 |
|
23 |
#if HAVE_IOSTREAM |
24 |
#include <iostream> |
25 |
#elif HAVE_IOSTREAM_H |
26 |
#include <iostream.h> |
27 |
#endif |
28 |
|
29 |
#if HAVE_FSTREAM |
30 |
#include <fstream> |
31 |
#elif HAVE_FSTREAM_H |
32 |
#include <fstream.h> |
33 |
#endif |
34 |
#include <vector> |
35 |
|
36 |
#include "fingerprint.hpp" |
37 |
#include "oberror.hpp" |
38 |
|
39 |
using namespace std; |
40 |
namespace OpenBabel { |
41 |
|
42 |
OBFingerprint* OBFingerprint::_pDefault; //static variable |
43 |
const unsigned int OBFingerprint::bitsperint = 8 * sizeof(unsigned int); |
44 |
|
45 |
void OBFingerprint::SetBit(vector<unsigned int>& vec, unsigned int n) |
46 |
{ |
47 |
vec[n/bitsperint] |= (1 << (n % bitsperint)); |
48 |
} |
49 |
|
50 |
//////////////////////////////////////// |
51 |
void OBFingerprint::Fold(vector<unsigned int>& vec, unsigned int nbits) |
52 |
{ |
53 |
while(vec.size()*bitsperint/2 >= nbits) |
54 |
vec.erase(transform(vec.begin(),vec.begin()+vec.size()/2, |
55 |
vec.begin()+vec.size()/2, vec.begin(), bit_or()), vec.end()); |
56 |
} |
57 |
|
58 |
//////////////////////////////////////// |
59 |
bool OBFingerprint::GetNextFPrt(std::string& id, OBFingerprint*& pFPrt) |
60 |
{ |
61 |
Fptpos iter; |
62 |
if(id.empty()) |
63 |
iter=FPtsMap().begin(); |
64 |
else |
65 |
{ |
66 |
iter=FPtsMap().find(id); |
67 |
if(iter!=FPtsMap().end()) |
68 |
++iter; |
69 |
} |
70 |
if(iter==FPtsMap().end()) |
71 |
return false; |
72 |
id = iter->first; |
73 |
pFPrt = iter->second; |
74 |
return true; |
75 |
} |
76 |
|
77 |
OBFingerprint* OBFingerprint::FindFingerprint(string& ID) |
78 |
{ |
79 |
if(ID.empty()) |
80 |
return _pDefault; |
81 |
Fptpos iter = FPtsMap().find(ID); |
82 |
if(iter==FPtsMap().end()) |
83 |
return NULL; |
84 |
else |
85 |
return iter->second; |
86 |
} |
87 |
|
88 |
double OBFingerprint::Tanimoto(const vector<unsigned int>& vec1, const vector<unsigned int>& vec2) |
89 |
{ |
90 |
//Independent of sizeof(unsigned int) |
91 |
if(vec1.size()!=vec2.size()) |
92 |
return -1; //different number of bits |
93 |
int andbits=0, orbits=0; |
94 |
for (unsigned i=0;i<vec1.size();++i) |
95 |
{ |
96 |
int andfp = vec1[i] & vec2[i]; |
97 |
int orfp = vec1[i] | vec2[i]; |
98 |
//Count bits |
99 |
for(;andfp;andfp=andfp<<1) |
100 |
if(andfp<0) ++andbits; |
101 |
for(;orfp;orfp=orfp<<1) |
102 |
if(orfp<0) ++orbits; |
103 |
} |
104 |
return((double)andbits/(double)orbits); |
105 |
} |
106 |
|
107 |
//*****************************************************************8 |
108 |
/*! |
109 |
|
110 |
*/ |
111 |
bool FastSearch::Find(OBBase* pOb, vector<unsigned int>& SeekPositions, |
112 |
unsigned int MaxCandidates) |
113 |
{ |
114 |
///Finds chemical objects in datafilename (which must previously have been indexed) |
115 |
///that have all the same bits set in their fingerprints as in the fingerprint of |
116 |
///a pattern object. (Usually a substructure search in molecules.) |
117 |
///The type of fingerprint and its degree of folding does not have to be specified |
118 |
///here because the values in the index file are used. |
119 |
///The positions of the candidate matching molecules in the original datafile are returned. |
120 |
|
121 |
vector<unsigned int> vecwords; |
122 |
_pFP->GetFingerprint(pOb,vecwords, _index.header.words * OBFingerprint::bitsperint); |
123 |
|
124 |
vector<unsigned int>candidates; //indices of matches from fingerprint screen |
125 |
candidates.reserve(MaxCandidates); |
126 |
|
127 |
unsigned int dataSize = _index.header.nEntries; |
128 |
// GetFingerprint(mol, vecwords, _index.header.words, _index.header.fptype); |
129 |
|
130 |
unsigned int words = _index.header.words; |
131 |
unsigned int* nextp = &_index.fptdata[0]; |
132 |
unsigned int* ppat0 = &vecwords[0]; |
133 |
register unsigned int* p; |
134 |
register unsigned int* ppat; |
135 |
register unsigned int a; |
136 |
unsigned int i; // need address of this, can't be register |
137 |
for(i=0;i<dataSize; i++) //speed critical section |
138 |
{ |
139 |
p=nextp; |
140 |
nextp += words; |
141 |
ppat=ppat0; |
142 |
a=0; |
143 |
while(p<nextp) |
144 |
{ |
145 |
if ( (a=((*ppat) & (*p++)) ^ (*ppat++)) ) break; |
146 |
} |
147 |
if(!a) |
148 |
{ |
149 |
candidates.push_back(i); |
150 |
if(candidates.size()>=MaxCandidates) |
151 |
break; |
152 |
} |
153 |
} |
154 |
|
155 |
if(i<_index.header.nEntries) //premature end to search |
156 |
{ |
157 |
#ifdef HAVE_SSTREAM |
158 |
stringstream errorMsg; |
159 |
#else |
160 |
strstream errorMsg; |
161 |
#endif |
162 |
errorMsg << "Stopped looking after " << i << " molecules." << endl; |
163 |
obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obInfo); |
164 |
} |
165 |
|
166 |
vector<unsigned int>::iterator itr; |
167 |
for(itr=candidates.begin();itr!=candidates.end();itr++) |
168 |
{ |
169 |
SeekPositions.push_back(_index.seekdata[*itr]); |
170 |
} |
171 |
return true; |
172 |
} |
173 |
|
174 |
///////////////////////////////////////////////////////// |
175 |
bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned int>& SeekposMap, |
176 |
double MinTani) |
177 |
{ |
178 |
vector<unsigned int> targetfp; |
179 |
_pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::bitsperint); |
180 |
|
181 |
unsigned int words = _index.header.words; |
182 |
unsigned int dataSize = _index.header.nEntries; |
183 |
unsigned int* nextp = &_index.fptdata[0]; |
184 |
register unsigned int* p; |
185 |
register unsigned int i; |
186 |
for(i=0;i<dataSize; i++) //speed critical section |
187 |
{ |
188 |
p=nextp; |
189 |
nextp += words; |
190 |
double tani = OBFingerprint::Tanimoto(targetfp,p); |
191 |
if(tani>MinTani) |
192 |
SeekposMap.insert(pair<const double, unsigned int>(tani,_index.seekdata[i])); |
193 |
} |
194 |
return true; |
195 |
} |
196 |
|
197 |
///////////////////////////////////////////////////////// |
198 |
bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned int>& SeekposMap, |
199 |
int nCandidates) |
200 |
{ |
201 |
///If nCandidates is zero or omitted the original size of the multimap is used |
202 |
if(nCandidates) |
203 |
{ |
204 |
//initialise the multimap with nCandidate zero entries |
205 |
SeekposMap.clear(); |
206 |
int i; |
207 |
for(i=0;i<nCandidates;++i) |
208 |
SeekposMap.insert(pair<const double, unsigned int>(0,0)); |
209 |
} |
210 |
else if(SeekposMap.size()==0) |
211 |
return false; |
212 |
|
213 |
vector<unsigned int> targetfp; |
214 |
_pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::bitsperint); |
215 |
|
216 |
unsigned int words = _index.header.words; |
217 |
unsigned int dataSize = _index.header.nEntries; |
218 |
unsigned int* nextp = &_index.fptdata[0]; |
219 |
register unsigned int* p; |
220 |
register unsigned int i; |
221 |
for(i=0;i<dataSize; i++) //speed critical section |
222 |
{ |
223 |
p=nextp; |
224 |
nextp += words; |
225 |
double tani = OBFingerprint::Tanimoto(targetfp,p); |
226 |
if(tani>SeekposMap.begin()->first) |
227 |
{ |
228 |
SeekposMap.insert(pair<const double, unsigned int>(tani,_index.seekdata[i])); |
229 |
SeekposMap.erase(SeekposMap.begin()); |
230 |
} |
231 |
} |
232 |
return true; |
233 |
}///////////////////////////////////////////////////////// |
234 |
string FastSearch::ReadIndex(istream* pIndexstream) |
235 |
{ |
236 |
//Reads fs index from istream into member variables |
237 |
// but first checks whether it is already loaded |
238 |
FptIndexHeader headercopy = _index.header; |
239 |
pIndexstream->read((char*)&(_index.header), sizeof(FptIndexHeader)); |
240 |
|
241 |
if(memcmp(&headercopy,&(_index.header),sizeof(FptIndexHeader))) |
242 |
{ |
243 |
pIndexstream->seekg(_index.header.headerlength);//allows header length to be changed |
244 |
|
245 |
unsigned int nwords = _index.header.nEntries * _index.header.words; |
246 |
_index.fptdata.resize(nwords); |
247 |
_index.seekdata.resize(_index.header.nEntries); |
248 |
|
249 |
pIndexstream->read((char*)&(_index.fptdata[0]), sizeof(unsigned int) * nwords); |
250 |
pIndexstream->read((char*)&(_index.seekdata[0]), sizeof(unsigned int) * _index.header.nEntries); |
251 |
|
252 |
if(pIndexstream->fail()) |
253 |
*(_index.header.datafilename) = '\0'; |
254 |
|
255 |
string tempFP(_index.header.fpid); |
256 |
_pFP = OBFingerprint::FindFingerprint(tempFP); |
257 |
if(!_pFP) |
258 |
{ |
259 |
#ifdef HAVE_SSTREAM |
260 |
stringstream errorMsg; |
261 |
#else |
262 |
strstream errorMsg; |
263 |
#endif |
264 |
errorMsg << "Index has Fingerprints of type '" << _index.header.fpid |
265 |
<< " which is not currently loaded." << endl; |
266 |
obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obWarning); |
267 |
*(_index.header.datafilename) = '\0'; |
268 |
} |
269 |
|
270 |
} |
271 |
return _index.header.datafilename; |
272 |
} |
273 |
|
274 |
//******************************************************* |
275 |
FastSearchIndexer::FastSearchIndexer(string& datafilename, ostream* os, |
276 |
std::string& fpid, int FptBits) |
277 |
{ |
278 |
///Starts indexing process |
279 |
_indexstream = os; |
280 |
_pFP = OBFingerprint::FindFingerprint(fpid); |
281 |
if(!_pFP) |
282 |
{ |
283 |
#ifdef HAVE_SSTREAM |
284 |
stringstream errorMsg; |
285 |
#else |
286 |
strstream errorMsg; |
287 |
#endif |
288 |
errorMsg << "Fingerprint type '" << fpid << "' not available" << endl; |
289 |
obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obWarning); |
290 |
} |
291 |
|
292 |
_nbits=FptBits; |
293 |
_pindex= new FptIndex; |
294 |
_pindex->header.headerlength = sizeof(FptIndexHeader); |
295 |
strncpy(_pindex->header.fpid,fpid.c_str(),15); |
296 |
strncpy(_pindex->header.datafilename, datafilename.c_str(), 255); |
297 |
} |
298 |
|
299 |
///////////////////////////////////////////////////////////// |
300 |
FastSearchIndexer::~FastSearchIndexer() |
301 |
{ |
302 |
///Saves index file |
303 |
_pindex->header.nEntries = _pindex->seekdata.size(); |
304 |
_indexstream->write((const char*)&_pindex->header, sizeof(FptIndexHeader)); |
305 |
_indexstream->write((const char*)&_pindex->fptdata[0], _pindex->fptdata.size()*sizeof(unsigned int)); |
306 |
_indexstream->write((const char*)&_pindex->seekdata[0], _pindex->seekdata.size()*sizeof(unsigned int)); |
307 |
if(!_indexstream) |
308 |
obErrorLog.ThrowError(__FUNCTION__, |
309 |
"Difficulty writing index", obWarning); |
310 |
delete _pindex; |
311 |
} |
312 |
|
313 |
/////////////////////////////////////////////////////////////// |
314 |
bool FastSearchIndexer::Add(OBBase* pOb, streampos seekpos) |
315 |
{ |
316 |
///Adds a fingerprint |
317 |
|
318 |
vector<unsigned int> vecwords; |
319 |
if(!_pFP) |
320 |
return false; |
321 |
if(_pFP->GetFingerprint(pOb, vecwords, _nbits)) |
322 |
{ |
323 |
_pindex->header.words = vecwords.size(); //Use size as returned from fingerprint |
324 |
for(unsigned int i=0;i<_pindex->header.words;i++) |
325 |
_pindex->fptdata.push_back(vecwords[i]); |
326 |
_pindex->seekdata.push_back(seekpos); |
327 |
return true; |
328 |
} |
329 |
obErrorLog.ThrowError(__FUNCTION__, "Failed to make a fingerprint", obWarning); |
330 |
return false; |
331 |
|
332 |
} |
333 |
|
334 |
/*! |
335 |
\class OBFingerprint |
336 |
These fingerprints are condensed representation of molecules (or other objects) |
337 |
as a list of boolean values (actually bits in a vector<unsigned>) with length |
338 |
of a power of 2. The main motivation is for fast searching of data sources |
339 |
containing large numbers of molecules (up to several million). OpenBabel |
340 |
provides some routines which can search text files containing lists of molecules |
341 |
in any format. See the documentation on the class FastSearch. |
342 |
|
343 |
There are descriptions of molecular fingerprints at <br> |
344 |
http://www.daylight.com/dayhtml/doc/theory/theory.finger.html) and <br> |
345 |
http://www.mesaac.com/Fingerprint.htm <br> |
346 |
Many methods of preparing fingerprints have been described, but the type supported |
347 |
currently in OpenBabel has each bit representing a substructure (or other |
348 |
molecular property). If a substructure is present in the molecule, then a |
349 |
particular bit is set to 1. But because the hashing method may also map other |
350 |
substructures to the same bit, a match does not guarantee that a particular |
351 |
substructure is present; there may be false positives. However, with proper design, |
352 |
a large fraction of irrelevant molecules in a data set can be eliminated in a |
353 |
fast search with boolean methods on the fingerprints. |
354 |
It then becomes feasible to make a definitive substructure search by |
355 |
conventional methods on this reduced list even if it is slow. |
356 |
|
357 |
OpenBabel provides a framework for applying new types of fingerprints without |
358 |
changing any existing code. They are derived from OBFingerprint and the |
359 |
source file is just compiled with the rest of OpenBabel. Alternatively, |
360 |
they can be separately compiled as a DLL or shared library and discovered |
361 |
when OpenBabel runs. |
362 |
|
363 |
Fingerprints derived from this abstract base class OBFingerprint can be for any |
364 |
object derived from OBBase (not just for OBMol). |
365 |
Each derived class provides an ID as a string and OBFingerprint keeps a map of |
366 |
these to provides a pointer to the class when requested in FindFingerprint. |
367 |
|
368 |
<h4>-- To define a fingerprint type --</h4> |
369 |
The classes derived form OBFingerprint are required to provide |
370 |
a GetFingerprint() routine and a Description() routine |
371 |
\code |
372 |
class MyFpType : OBFingerprint |
373 |
{ |
374 |
MyFpType(const char* id) : OBFingerprint(id){}; |
375 |
virtual bool GetFingerprint(OBBase* pOb, vector<unsigned int>& fp, int nbits) |
376 |
{ |
377 |
//Convert pOb to the required type, usually OBMol |
378 |
OBMol* pmol = dynamic_cast<OBMol*>(pOb); |
379 |
fp.resize(required_number_of_words); |
380 |
... |
381 |
use SetBit(fp,n); to set the nth bit |
382 |
|
383 |
if(nbits) |
384 |
Fold(fp, nbits); |
385 |
} |
386 |
virtual const char* Description(){ return "Some descriptive text";} |
387 |
... |
388 |
}; |
389 |
\endcode |
390 |
//Declare a global instance with the ID you will use in -f options to specify its use. |
391 |
\code |
392 |
MyFpType theMyFpType("myfpID"); |
393 |
\endcode |
394 |
|
395 |
<h4>-- To obtain a fingerprint --</h4> |
396 |
\code |
397 |
OBMol mol; |
398 |
... |
399 |
vector<unsigned int> fp; |
400 |
OBFingerprint::GetDefault()->GetFingerprint(&mol, fp); //gets default size of fingerprint |
401 |
\endcode |
402 |
or |
403 |
\code |
404 |
vector<unsigned int> fp; |
405 |
OBFingerPrint* pFP = OBFingerprint::FindFingerprint("myfpID"); |
406 |
...and maybe... |
407 |
pFP->GetFingerprint(&mol,fp, 128); //fold down to 128bits if was originally larger |
408 |
\endcode |
409 |
|
410 |
<h4>-- To print a list of available fingerprint types --</h4> |
411 |
\code |
412 |
std::string id; |
413 |
OBFingerPrint* pFPrt=NULL; |
414 |
while(OBFingerprint::GetNextFPrt(id, pFPrt)) |
415 |
{ |
416 |
cout << id << " -- " << pFPrt->Description() << endl; |
417 |
} |
418 |
\endcode |
419 |
|
420 |
|
421 |
Fingerprints are handled as vector<unsigned int> so that the number of bits |
422 |
in this vector and their order will be platform and compiler |
423 |
dependent, because of size of int and endian differences. Use fingerprints |
424 |
(and fastsearch indexes containing them) only for comparing with other |
425 |
fingerprints prepared on the same machine. |
426 |
|
427 |
The FingerprintFormat class is an output format which displays fingerprints |
428 |
as hexadecimal. When multiple molecules are supplied it will calculate the |
429 |
Tanimoto coefficient from the first molecule to each of the others. It also |
430 |
shows whether the first molecule is a possible substructure to all the others, |
431 |
i.e. whether all the bits set in the fingerprint for the first molecule are |
432 |
set in the fingerprint of the others. To display hexadecimal information when |
433 |
multiple molecules are provided it is necessay to use the -xh option. |
434 |
|
435 |
To see a list of available format types, type babel -F on the command line. |
436 |
The -xF option of the FingerprintFormat class also provides this output, but due |
437 |
to a quirk in the way the program works, it is necessary to have a valid input |
438 |
molecule for this option to work. |
439 |
*/ |
440 |
|
441 |
/*! \class FastSearch |
442 |
The FastSearch class searches an index to a datafile containing a list of molecules |
443 |
(or other objects) prepared by FastSearchIndexer. |
444 |
|
445 |
OpenBabel can also search files for molecules containing a substructure specified |
446 |
by a SMARTS string, using OBSmartsPattern or from the command line: |
447 |
\code |
448 |
babel datafile.xxx -outfile.yyy -sSMARTS |
449 |
\endcode |
450 |
But when the data file contains more than about 10,000 molecules this becomes |
451 |
rather too slow to be used interactively. To do it more quickly, an index |
452 |
of the molecules containing their fingerprints (see OBFingerprint) is prepared using |
453 |
FastSearchIndexer. The indexing process may take a long time but only has to |
454 |
be done once. The index can be searched very quickly with FastSearch. Because |
455 |
of the nature of fingerprints a match to a bit does not guarantee |
456 |
the presence of a particular substructure or other molecular property, so that |
457 |
a definitive answer may require a subsequent search of the (much reduced) list |
458 |
of candidates by another method (like OBSmartsPattern). |
459 |
|
460 |
Note that the index files are not portable. They need to be prepared on the |
461 |
computer that will acess them. |
462 |
|
463 |
<h4>Using FastSearch and FastSearchIndexer in a program</h4> |
464 |
|
465 |
The index has two tables: |
466 |
- an array of fingerprints of the molecules |
467 |
- an array of the seek positions in the datasource file of all the molecules |
468 |
|
469 |
<h4>To prepare an fastsearch index file:</h4> |
470 |
- Open an ostream to the index file. |
471 |
- Make a FastSearchIndexer object on the heap or the stack, passing in as parameters: |
472 |
- the datafile name, the indexfile stream, |
473 |
- the id of the fingerprint type to be used, |
474 |
- the number of bits the fingerprint is to be folded down to, If it is to be left |
475 |
unfolded, set fpid to 0 or do not specify it. |
476 |
. |
477 |
- For each molecule, call Add() with its pointer and its position in the datafile.<br> |
478 |
Currently the std::streampos value is implicitly cast to unsigned int so that |
479 |
for 32bit machines the datafile has to be no longer than about 2Gbyte. |
480 |
- The index file is written when the FastSearchIndexer object is deleted or goes out |
481 |
of scope. |
482 |
|
483 |
<h4>To search in a fastsearch index file</h4> |
484 |
|
485 |
- Open a std::istream to the indexfile (in binary mode on some systems) |
486 |
- Make a FastSearch object, read the index and open the datafile whose |
487 |
name it provides |
488 |
\code |
489 |
ifstream ifs(indexname,ios::binary); |
490 |
FastSearch fs; |
491 |
string datafilename = fs.ReadIndex(&ifs); |
492 |
if(datafilename.empty()/endcode |
493 |
return false; |
494 |
ifstream datastream(datafilename); |
495 |
if(!datastream) |
496 |
return false; |
497 |
\endcode |
498 |
|
499 |
<strong>To do a search for molecules which have all the substructure bits the |
500 |
OBMol object, patternMol</strong> |
501 |
\code |
502 |
vector<unsigned int>& SeekPositions; |
503 |
if(!fs.Find(patternMol, SeekPositions, MaxCandidates)) |
504 |
|
505 |
for(itr=SeekPositions.begin();itr!=SeekPositions.end();itr++) |
506 |
{ |
507 |
datastream.seekg(*itr); |
508 |
... read the candidate molecule |
509 |
and subject to more rigorous test if necessary |
510 |
} |
511 |
\endcode |
512 |
|
513 |
<strong>To do a similarity search based on the Tanimoto coefficient</strong> |
514 |
This is defined as: <br> |
515 |
<em>Number of bits set in (patternFP & targetFP)/Number of bits in (patternFP | targetFP)</em><br> |
516 |
where the boolean operations between the fingerprints are bitwise<br> |
517 |
The Tanimoto coefficient has no absolute meaning and depends on |
518 |
the design of the fingerprint. |
519 |
\code |
520 |
multimap<double, unsigned int> SeekposMap; |
521 |
// to find n best molecules |
522 |
fs.FindSimilar(&patternMol, SeekposMap, n); |
523 |
...or |
524 |
// to find molecules with Tanimoto coefficient > MinTani |
525 |
fs.FindSimilar(&patternMol, SeekposMap, MinTani); |
526 |
|
527 |
multimap<double, unsigned int>::reverse_iterator itr; |
528 |
for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr) |
529 |
{ |
530 |
datastream.seekg(itr->second); |
531 |
... read the candidate molecule |
532 |
double tani = itr->first; |
533 |
} |
534 |
\endcode |
535 |
|
536 |
The FastSearchFormat class facilitates the use of these routine from the |
537 |
command line or other front end program. For instance: |
538 |
|
539 |
<strong>Prepare an index:</strong> |
540 |
\code |
541 |
babel datafile.xxx index.fs |
542 |
\endcode |
543 |
or |
544 |
\code |
545 |
babel datafile.xxx -ofs namedindex |
546 |
\endcode |
547 |
With options you can specify: |
548 |
- which type of fingerprint to be used, e.g. -xfFP2, |
549 |
- whether it is folded to a specified number of bits, e.g. -xn128 |
550 |
(which should be a power of2) |
551 |
- whether to pre-select the molecules which are indexed: |
552 |
- by structure e.g only ethers and esters, -sCOC |
553 |
- by excluding molecules with bezene rings, -vc1ccccc1 |
554 |
- by position in the datafile e.g. only mols 10 to 90, -f10 -l90 |
555 |
. |
556 |
<strong>Substructure search</strong> in a previously prepared index file |
557 |
\code |
558 |
babel index.fs outfile.yyy -sSMILES |
559 |
\endcode |
560 |
The datafile can also be used as the input file, provided the input format is specified as fs |
561 |
\code |
562 |
babel datafile.xxx outfile.yyy -sSMILES -ifs |
563 |
\endcode |
564 |
A "slow" search not using fingerprints would be done on the same data by omitting -ifs. |
565 |
A "slow" search can use SMARTS strings, but the fast search is restricted |
566 |
to the subset which is valid SMILES. |
567 |
|
568 |
With the -S option, the target can be specified as a molecule in a file of any format |
569 |
\code |
570 |
babel datafile.xxx outfile.yyy -Spattern_mol.zzz -ifs |
571 |
\endcode |
572 |
These searches have two stages: a fingerprint search which produces |
573 |
a number of candidate molecules and a definitive search which selects |
574 |
from these using SMARTS pattern matching. The maximum number of candidates |
575 |
is 4000 by default but you can change this with an option |
576 |
e.g. -al 8000 (Note that you need the space between l and the number.) |
577 |
If the fingerprint search reaches the maximum number it will not |
578 |
look further and will tell you at what stage it stopped. |
579 |
|
580 |
<strong>Similarity search</strong> in a previously prepared index file<br> |
581 |
This rather done (rather than a substructure search) if the -at option is used, |
582 |
\code |
583 |
babel datafile.xxx outfile.yyy -sSMILES -ifs -at0.7 |
584 |
\endcode |
585 |
for instance |
586 |
- -at0.7 will recover all molecules with Tanimoto greater than 0.7 |
587 |
- -at15 (no decimal point) will recover the 15 molecules with largest coefficients. |
588 |
- -aa will add the Tanimoto coefficient to the titles of the output molecules. |
589 |
|
590 |
All stages, the indexing, the interpretation of the SMILES string in the -s option, |
591 |
the file in the -S option and the final SMARTS filter convert to OBMol and apply |
592 |
ConvertDativeBonds(). This ensures thatforms such as[N+]([O-])=O and N(=O)=O |
593 |
are recognized as chemically identical. |
594 |
*/ |
595 |
|
596 |
}//Openbabel |
597 |
|
598 |
//! \file fingerprint.cpp |
599 |
//! \brief Definitions for OBFingerprint base class and fastsearch classes |