ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/group/trunk/OOPSE-3.0/src/openbabel/fingerprint.cpp
Revision: 2450
Committed: Thu Nov 17 20:38:45 2005 UTC (18 years, 8 months ago) by gezelter
File size: 20973 byte(s)
Log Message:
Unifying config.h stuff and making sure the OpenBabel codes can find
our default (and environment variable) Force Field directories.

File Contents

# Content
1 /**********************************************************************
2 fingerprint.cpp - Implementation of fingerpring base class and fastsearching
3
4 Copyright (C) 2005 by Chris Morley
5
6 This file is part of the Open Babel project.
7 For more information, see <http://openbabel.sourceforge.net/>
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation version 2 of the License.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17 ***********************************************************************/
18
19 #include "config.h"
20 #include <vector>
21 #include <algorithm>
22
23 #if HAVE_IOSTREAM
24 #include <iostream>
25 #elif HAVE_IOSTREAM_H
26 #include <iostream.h>
27 #endif
28
29 #if HAVE_FSTREAM
30 #include <fstream>
31 #elif HAVE_FSTREAM_H
32 #include <fstream.h>
33 #endif
34 #include <vector>
35
36 #include "fingerprint.hpp"
37 #include "oberror.hpp"
38
39 using namespace std;
40 namespace OpenBabel {
41
42 OBFingerprint* OBFingerprint::_pDefault; //static variable
43 const unsigned int OBFingerprint::bitsperint = 8 * sizeof(unsigned int);
44
45 void OBFingerprint::SetBit(vector<unsigned int>& vec, unsigned int n)
46 {
47 vec[n/bitsperint] |= (1 << (n % bitsperint));
48 }
49
50 ////////////////////////////////////////
51 void OBFingerprint::Fold(vector<unsigned int>& vec, unsigned int nbits)
52 {
53 while(vec.size()*bitsperint/2 >= nbits)
54 vec.erase(transform(vec.begin(),vec.begin()+vec.size()/2,
55 vec.begin()+vec.size()/2, vec.begin(), bit_or()), vec.end());
56 }
57
58 ////////////////////////////////////////
59 bool OBFingerprint::GetNextFPrt(std::string& id, OBFingerprint*& pFPrt)
60 {
61 Fptpos iter;
62 if(id.empty())
63 iter=FPtsMap().begin();
64 else
65 {
66 iter=FPtsMap().find(id);
67 if(iter!=FPtsMap().end())
68 ++iter;
69 }
70 if(iter==FPtsMap().end())
71 return false;
72 id = iter->first;
73 pFPrt = iter->second;
74 return true;
75 }
76
77 OBFingerprint* OBFingerprint::FindFingerprint(string& ID)
78 {
79 if(ID.empty())
80 return _pDefault;
81 Fptpos iter = FPtsMap().find(ID);
82 if(iter==FPtsMap().end())
83 return NULL;
84 else
85 return iter->second;
86 }
87
88 double OBFingerprint::Tanimoto(const vector<unsigned int>& vec1, const vector<unsigned int>& vec2)
89 {
90 //Independent of sizeof(unsigned int)
91 if(vec1.size()!=vec2.size())
92 return -1; //different number of bits
93 int andbits=0, orbits=0;
94 for (unsigned i=0;i<vec1.size();++i)
95 {
96 int andfp = vec1[i] & vec2[i];
97 int orfp = vec1[i] | vec2[i];
98 //Count bits
99 for(;andfp;andfp=andfp<<1)
100 if(andfp<0) ++andbits;
101 for(;orfp;orfp=orfp<<1)
102 if(orfp<0) ++orbits;
103 }
104 return((double)andbits/(double)orbits);
105 }
106
107 //*****************************************************************8
108 /*!
109
110 */
111 bool FastSearch::Find(OBBase* pOb, vector<unsigned int>& SeekPositions,
112 unsigned int MaxCandidates)
113 {
114 ///Finds chemical objects in datafilename (which must previously have been indexed)
115 ///that have all the same bits set in their fingerprints as in the fingerprint of
116 ///a pattern object. (Usually a substructure search in molecules.)
117 ///The type of fingerprint and its degree of folding does not have to be specified
118 ///here because the values in the index file are used.
119 ///The positions of the candidate matching molecules in the original datafile are returned.
120
121 vector<unsigned int> vecwords;
122 _pFP->GetFingerprint(pOb,vecwords, _index.header.words * OBFingerprint::bitsperint);
123
124 vector<unsigned int>candidates; //indices of matches from fingerprint screen
125 candidates.reserve(MaxCandidates);
126
127 unsigned int dataSize = _index.header.nEntries;
128 // GetFingerprint(mol, vecwords, _index.header.words, _index.header.fptype);
129
130 unsigned int words = _index.header.words;
131 unsigned int* nextp = &_index.fptdata[0];
132 unsigned int* ppat0 = &vecwords[0];
133 register unsigned int* p;
134 register unsigned int* ppat;
135 register unsigned int a;
136 unsigned int i; // need address of this, can't be register
137 for(i=0;i<dataSize; i++) //speed critical section
138 {
139 p=nextp;
140 nextp += words;
141 ppat=ppat0;
142 a=0;
143 while(p<nextp)
144 {
145 if ( (a=((*ppat) & (*p++)) ^ (*ppat++)) ) break;
146 }
147 if(!a)
148 {
149 candidates.push_back(i);
150 if(candidates.size()>=MaxCandidates)
151 break;
152 }
153 }
154
155 if(i<_index.header.nEntries) //premature end to search
156 {
157 #ifdef HAVE_SSTREAM
158 stringstream errorMsg;
159 #else
160 strstream errorMsg;
161 #endif
162 errorMsg << "Stopped looking after " << i << " molecules." << endl;
163 obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obInfo);
164 }
165
166 vector<unsigned int>::iterator itr;
167 for(itr=candidates.begin();itr!=candidates.end();itr++)
168 {
169 SeekPositions.push_back(_index.seekdata[*itr]);
170 }
171 return true;
172 }
173
174 /////////////////////////////////////////////////////////
175 bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned int>& SeekposMap,
176 double MinTani)
177 {
178 vector<unsigned int> targetfp;
179 _pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::bitsperint);
180
181 unsigned int words = _index.header.words;
182 unsigned int dataSize = _index.header.nEntries;
183 unsigned int* nextp = &_index.fptdata[0];
184 register unsigned int* p;
185 register unsigned int i;
186 for(i=0;i<dataSize; i++) //speed critical section
187 {
188 p=nextp;
189 nextp += words;
190 double tani = OBFingerprint::Tanimoto(targetfp,p);
191 if(tani>MinTani)
192 SeekposMap.insert(pair<const double, unsigned int>(tani,_index.seekdata[i]));
193 }
194 return true;
195 }
196
197 /////////////////////////////////////////////////////////
198 bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned int>& SeekposMap,
199 int nCandidates)
200 {
201 ///If nCandidates is zero or omitted the original size of the multimap is used
202 if(nCandidates)
203 {
204 //initialise the multimap with nCandidate zero entries
205 SeekposMap.clear();
206 int i;
207 for(i=0;i<nCandidates;++i)
208 SeekposMap.insert(pair<const double, unsigned int>(0,0));
209 }
210 else if(SeekposMap.size()==0)
211 return false;
212
213 vector<unsigned int> targetfp;
214 _pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::bitsperint);
215
216 unsigned int words = _index.header.words;
217 unsigned int dataSize = _index.header.nEntries;
218 unsigned int* nextp = &_index.fptdata[0];
219 register unsigned int* p;
220 register unsigned int i;
221 for(i=0;i<dataSize; i++) //speed critical section
222 {
223 p=nextp;
224 nextp += words;
225 double tani = OBFingerprint::Tanimoto(targetfp,p);
226 if(tani>SeekposMap.begin()->first)
227 {
228 SeekposMap.insert(pair<const double, unsigned int>(tani,_index.seekdata[i]));
229 SeekposMap.erase(SeekposMap.begin());
230 }
231 }
232 return true;
233 }/////////////////////////////////////////////////////////
234 string FastSearch::ReadIndex(istream* pIndexstream)
235 {
236 //Reads fs index from istream into member variables
237 // but first checks whether it is already loaded
238 FptIndexHeader headercopy = _index.header;
239 pIndexstream->read((char*)&(_index.header), sizeof(FptIndexHeader));
240
241 if(memcmp(&headercopy,&(_index.header),sizeof(FptIndexHeader)))
242 {
243 pIndexstream->seekg(_index.header.headerlength);//allows header length to be changed
244
245 unsigned int nwords = _index.header.nEntries * _index.header.words;
246 _index.fptdata.resize(nwords);
247 _index.seekdata.resize(_index.header.nEntries);
248
249 pIndexstream->read((char*)&(_index.fptdata[0]), sizeof(unsigned int) * nwords);
250 pIndexstream->read((char*)&(_index.seekdata[0]), sizeof(unsigned int) * _index.header.nEntries);
251
252 if(pIndexstream->fail())
253 *(_index.header.datafilename) = '\0';
254
255 string tempFP(_index.header.fpid);
256 _pFP = OBFingerprint::FindFingerprint(tempFP);
257 if(!_pFP)
258 {
259 #ifdef HAVE_SSTREAM
260 stringstream errorMsg;
261 #else
262 strstream errorMsg;
263 #endif
264 errorMsg << "Index has Fingerprints of type '" << _index.header.fpid
265 << " which is not currently loaded." << endl;
266 obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obWarning);
267 *(_index.header.datafilename) = '\0';
268 }
269
270 }
271 return _index.header.datafilename;
272 }
273
274 //*******************************************************
275 FastSearchIndexer::FastSearchIndexer(string& datafilename, ostream* os,
276 std::string& fpid, int FptBits)
277 {
278 ///Starts indexing process
279 _indexstream = os;
280 _pFP = OBFingerprint::FindFingerprint(fpid);
281 if(!_pFP)
282 {
283 #ifdef HAVE_SSTREAM
284 stringstream errorMsg;
285 #else
286 strstream errorMsg;
287 #endif
288 errorMsg << "Fingerprint type '" << fpid << "' not available" << endl;
289 obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obWarning);
290 }
291
292 _nbits=FptBits;
293 _pindex= new FptIndex;
294 _pindex->header.headerlength = sizeof(FptIndexHeader);
295 strncpy(_pindex->header.fpid,fpid.c_str(),15);
296 strncpy(_pindex->header.datafilename, datafilename.c_str(), 255);
297 }
298
299 /////////////////////////////////////////////////////////////
300 FastSearchIndexer::~FastSearchIndexer()
301 {
302 ///Saves index file
303 _pindex->header.nEntries = _pindex->seekdata.size();
304 _indexstream->write((const char*)&_pindex->header, sizeof(FptIndexHeader));
305 _indexstream->write((const char*)&_pindex->fptdata[0], _pindex->fptdata.size()*sizeof(unsigned int));
306 _indexstream->write((const char*)&_pindex->seekdata[0], _pindex->seekdata.size()*sizeof(unsigned int));
307 if(!_indexstream)
308 obErrorLog.ThrowError(__FUNCTION__,
309 "Difficulty writing index", obWarning);
310 delete _pindex;
311 }
312
313 ///////////////////////////////////////////////////////////////
314 bool FastSearchIndexer::Add(OBBase* pOb, streampos seekpos)
315 {
316 ///Adds a fingerprint
317
318 vector<unsigned int> vecwords;
319 if(!_pFP)
320 return false;
321 if(_pFP->GetFingerprint(pOb, vecwords, _nbits))
322 {
323 _pindex->header.words = vecwords.size(); //Use size as returned from fingerprint
324 for(unsigned int i=0;i<_pindex->header.words;i++)
325 _pindex->fptdata.push_back(vecwords[i]);
326 _pindex->seekdata.push_back(seekpos);
327 return true;
328 }
329 obErrorLog.ThrowError(__FUNCTION__, "Failed to make a fingerprint", obWarning);
330 return false;
331
332 }
333
334 /*!
335 \class OBFingerprint
336 These fingerprints are condensed representation of molecules (or other objects)
337 as a list of boolean values (actually bits in a vector<unsigned>) with length
338 of a power of 2. The main motivation is for fast searching of data sources
339 containing large numbers of molecules (up to several million). OpenBabel
340 provides some routines which can search text files containing lists of molecules
341 in any format. See the documentation on the class FastSearch.
342
343 There are descriptions of molecular fingerprints at <br>
344 http://www.daylight.com/dayhtml/doc/theory/theory.finger.html) and <br>
345 http://www.mesaac.com/Fingerprint.htm <br>
346 Many methods of preparing fingerprints have been described, but the type supported
347 currently in OpenBabel has each bit representing a substructure (or other
348 molecular property). If a substructure is present in the molecule, then a
349 particular bit is set to 1. But because the hashing method may also map other
350 substructures to the same bit, a match does not guarantee that a particular
351 substructure is present; there may be false positives. However, with proper design,
352 a large fraction of irrelevant molecules in a data set can be eliminated in a
353 fast search with boolean methods on the fingerprints.
354 It then becomes feasible to make a definitive substructure search by
355 conventional methods on this reduced list even if it is slow.
356
357 OpenBabel provides a framework for applying new types of fingerprints without
358 changing any existing code. They are derived from OBFingerprint and the
359 source file is just compiled with the rest of OpenBabel. Alternatively,
360 they can be separately compiled as a DLL or shared library and discovered
361 when OpenBabel runs.
362
363 Fingerprints derived from this abstract base class OBFingerprint can be for any
364 object derived from OBBase (not just for OBMol).
365 Each derived class provides an ID as a string and OBFingerprint keeps a map of
366 these to provides a pointer to the class when requested in FindFingerprint.
367
368 <h4>-- To define a fingerprint type --</h4>
369 The classes derived form OBFingerprint are required to provide
370 a GetFingerprint() routine and a Description() routine
371 \code
372 class MyFpType : OBFingerprint
373 {
374 MyFpType(const char* id) : OBFingerprint(id){};
375 virtual bool GetFingerprint(OBBase* pOb, vector<unsigned int>& fp, int nbits)
376 {
377 //Convert pOb to the required type, usually OBMol
378 OBMol* pmol = dynamic_cast<OBMol*>(pOb);
379 fp.resize(required_number_of_words);
380 ...
381 use SetBit(fp,n); to set the nth bit
382
383 if(nbits)
384 Fold(fp, nbits);
385 }
386 virtual const char* Description(){ return "Some descriptive text";}
387 ...
388 };
389 \endcode
390 //Declare a global instance with the ID you will use in -f options to specify its use.
391 \code
392 MyFpType theMyFpType("myfpID");
393 \endcode
394
395 <h4>-- To obtain a fingerprint --</h4>
396 \code
397 OBMol mol;
398 ...
399 vector<unsigned int> fp;
400 OBFingerprint::GetDefault()->GetFingerprint(&mol, fp); //gets default size of fingerprint
401 \endcode
402 or
403 \code
404 vector<unsigned int> fp;
405 OBFingerPrint* pFP = OBFingerprint::FindFingerprint("myfpID");
406 ...and maybe...
407 pFP->GetFingerprint(&mol,fp, 128); //fold down to 128bits if was originally larger
408 \endcode
409
410 <h4>-- To print a list of available fingerprint types --</h4>
411 \code
412 std::string id;
413 OBFingerPrint* pFPrt=NULL;
414 while(OBFingerprint::GetNextFPrt(id, pFPrt))
415 {
416 cout << id << " -- " << pFPrt->Description() << endl;
417 }
418 \endcode
419
420
421 Fingerprints are handled as vector<unsigned int> so that the number of bits
422 in this vector and their order will be platform and compiler
423 dependent, because of size of int and endian differences. Use fingerprints
424 (and fastsearch indexes containing them) only for comparing with other
425 fingerprints prepared on the same machine.
426
427 The FingerprintFormat class is an output format which displays fingerprints
428 as hexadecimal. When multiple molecules are supplied it will calculate the
429 Tanimoto coefficient from the first molecule to each of the others. It also
430 shows whether the first molecule is a possible substructure to all the others,
431 i.e. whether all the bits set in the fingerprint for the first molecule are
432 set in the fingerprint of the others. To display hexadecimal information when
433 multiple molecules are provided it is necessay to use the -xh option.
434
435 To see a list of available format types, type babel -F on the command line.
436 The -xF option of the FingerprintFormat class also provides this output, but due
437 to a quirk in the way the program works, it is necessary to have a valid input
438 molecule for this option to work.
439 */
440
441 /*! \class FastSearch
442 The FastSearch class searches an index to a datafile containing a list of molecules
443 (or other objects) prepared by FastSearchIndexer.
444
445 OpenBabel can also search files for molecules containing a substructure specified
446 by a SMARTS string, using OBSmartsPattern or from the command line:
447 \code
448 babel datafile.xxx -outfile.yyy -sSMARTS
449 \endcode
450 But when the data file contains more than about 10,000 molecules this becomes
451 rather too slow to be used interactively. To do it more quickly, an index
452 of the molecules containing their fingerprints (see OBFingerprint) is prepared using
453 FastSearchIndexer. The indexing process may take a long time but only has to
454 be done once. The index can be searched very quickly with FastSearch. Because
455 of the nature of fingerprints a match to a bit does not guarantee
456 the presence of a particular substructure or other molecular property, so that
457 a definitive answer may require a subsequent search of the (much reduced) list
458 of candidates by another method (like OBSmartsPattern).
459
460 Note that the index files are not portable. They need to be prepared on the
461 computer that will acess them.
462
463 <h4>Using FastSearch and FastSearchIndexer in a program</h4>
464
465 The index has two tables:
466 - an array of fingerprints of the molecules
467 - an array of the seek positions in the datasource file of all the molecules
468
469 <h4>To prepare an fastsearch index file:</h4>
470 - Open an ostream to the index file.
471 - Make a FastSearchIndexer object on the heap or the stack, passing in as parameters:
472 - the datafile name, the indexfile stream,
473 - the id of the fingerprint type to be used,
474 - the number of bits the fingerprint is to be folded down to, If it is to be left
475 unfolded, set fpid to 0 or do not specify it.
476 .
477 - For each molecule, call Add() with its pointer and its position in the datafile.<br>
478 Currently the std::streampos value is implicitly cast to unsigned int so that
479 for 32bit machines the datafile has to be no longer than about 2Gbyte.
480 - The index file is written when the FastSearchIndexer object is deleted or goes out
481 of scope.
482
483 <h4>To search in a fastsearch index file</h4>
484
485 - Open a std::istream to the indexfile (in binary mode on some systems)
486 - Make a FastSearch object, read the index and open the datafile whose
487 name it provides
488 \code
489 ifstream ifs(indexname,ios::binary);
490 FastSearch fs;
491 string datafilename = fs.ReadIndex(&ifs);
492 if(datafilename.empty()/endcode
493 return false;
494 ifstream datastream(datafilename);
495 if(!datastream)
496 return false;
497 \endcode
498
499 <strong>To do a search for molecules which have all the substructure bits the
500 OBMol object, patternMol</strong>
501 \code
502 vector<unsigned int>& SeekPositions;
503 if(!fs.Find(patternMol, SeekPositions, MaxCandidates))
504
505 for(itr=SeekPositions.begin();itr!=SeekPositions.end();itr++)
506 {
507 datastream.seekg(*itr);
508 ... read the candidate molecule
509 and subject to more rigorous test if necessary
510 }
511 \endcode
512
513 <strong>To do a similarity search based on the Tanimoto coefficient</strong>
514 This is defined as: <br>
515 <em>Number of bits set in (patternFP & targetFP)/Number of bits in (patternFP | targetFP)</em><br>
516 where the boolean operations between the fingerprints are bitwise<br>
517 The Tanimoto coefficient has no absolute meaning and depends on
518 the design of the fingerprint.
519 \code
520 multimap<double, unsigned int> SeekposMap;
521 // to find n best molecules
522 fs.FindSimilar(&patternMol, SeekposMap, n);
523 ...or
524 // to find molecules with Tanimoto coefficient > MinTani
525 fs.FindSimilar(&patternMol, SeekposMap, MinTani);
526
527 multimap<double, unsigned int>::reverse_iterator itr;
528 for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr)
529 {
530 datastream.seekg(itr->second);
531 ... read the candidate molecule
532 double tani = itr->first;
533 }
534 \endcode
535
536 The FastSearchFormat class facilitates the use of these routine from the
537 command line or other front end program. For instance:
538
539 <strong>Prepare an index:</strong>
540 \code
541 babel datafile.xxx index.fs
542 \endcode
543 or
544 \code
545 babel datafile.xxx -ofs namedindex
546 \endcode
547 With options you can specify:
548 - which type of fingerprint to be used, e.g. -xfFP2,
549 - whether it is folded to a specified number of bits, e.g. -xn128
550 (which should be a power of2)
551 - whether to pre-select the molecules which are indexed:
552 - by structure e.g only ethers and esters, -sCOC
553 - by excluding molecules with bezene rings, -vc1ccccc1
554 - by position in the datafile e.g. only mols 10 to 90, -f10 -l90
555 .
556 <strong>Substructure search</strong> in a previously prepared index file
557 \code
558 babel index.fs outfile.yyy -sSMILES
559 \endcode
560 The datafile can also be used as the input file, provided the input format is specified as fs
561 \code
562 babel datafile.xxx outfile.yyy -sSMILES -ifs
563 \endcode
564 A "slow" search not using fingerprints would be done on the same data by omitting -ifs.
565 A "slow" search can use SMARTS strings, but the fast search is restricted
566 to the subset which is valid SMILES.
567
568 With the -S option, the target can be specified as a molecule in a file of any format
569 \code
570 babel datafile.xxx outfile.yyy -Spattern_mol.zzz -ifs
571 \endcode
572 These searches have two stages: a fingerprint search which produces
573 a number of candidate molecules and a definitive search which selects
574 from these using SMARTS pattern matching. The maximum number of candidates
575 is 4000 by default but you can change this with an option
576 e.g. -al 8000 (Note that you need the space between l and the number.)
577 If the fingerprint search reaches the maximum number it will not
578 look further and will tell you at what stage it stopped.
579
580 <strong>Similarity search</strong> in a previously prepared index file<br>
581 This rather done (rather than a substructure search) if the -at option is used,
582 \code
583 babel datafile.xxx outfile.yyy -sSMILES -ifs -at0.7
584 \endcode
585 for instance
586 - -at0.7 will recover all molecules with Tanimoto greater than 0.7
587 - -at15 (no decimal point) will recover the 15 molecules with largest coefficients.
588 - -aa will add the Tanimoto coefficient to the titles of the output molecules.
589
590 All stages, the indexing, the interpretation of the SMILES string in the -s option,
591 the file in the -S option and the final SMARTS filter convert to OBMol and apply
592 ConvertDativeBonds(). This ensures thatforms such as[N+]([O-])=O and N(=O)=O
593 are recognized as chemically identical.
594 */
595
596 }//Openbabel
597
598 //! \file fingerprint.cpp
599 //! \brief Definitions for OBFingerprint base class and fastsearch classes