ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/group/trunk/OOPSE-4/src/openbabel/fingerprint.cpp
Revision: 2440
Committed: Wed Nov 16 19:42:11 2005 UTC (18 years, 8 months ago) by tim
File size: 20980 byte(s)
Log Message:
adding openbabel

File Contents

# User Rev Content
1 tim 2440 /**********************************************************************
2     fingerprint.cpp - Implementation of fingerpring base class and fastsearching
3    
4     Copyright (C) 2005 by Chris Morley
5    
6     This file is part of the Open Babel project.
7     For more information, see <http://openbabel.sourceforge.net/>
8    
9     This program is free software; you can redistribute it and/or modify
10     it under the terms of the GNU General Public License as published by
11     the Free Software Foundation version 2 of the License.
12    
13     This program is distributed in the hope that it will be useful,
14     but WITHOUT ANY WARRANTY; without even the implied warranty of
15     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16     GNU General Public License for more details.
17     ***********************************************************************/
18    
19     #include "babelconfig.hpp"
20     #include <vector>
21     #include <algorithm>
22    
23     #if HAVE_IOSTREAM
24     #include <iostream>
25     #elif HAVE_IOSTREAM_H
26     #include <iostream.h>
27     #endif
28    
29     #if HAVE_FSTREAM
30     #include <fstream>
31     #elif HAVE_FSTREAM_H
32     #include <fstream.h>
33     #endif
34     #include <vector>
35    
36     #include "fingerprint.hpp"
37     #include "oberror.hpp"
38    
39     using namespace std;
40     namespace OpenBabel {
41    
42     OBFingerprint* OBFingerprint::_pDefault; //static variable
43     const unsigned int OBFingerprint::bitsperint = 8 * sizeof(unsigned int);
44    
45     void OBFingerprint::SetBit(vector<unsigned int>& vec, unsigned int n)
46     {
47     vec[n/bitsperint] |= (1 << (n % bitsperint));
48     }
49    
50     ////////////////////////////////////////
51     void OBFingerprint::Fold(vector<unsigned int>& vec, unsigned int nbits)
52     {
53     while(vec.size()*bitsperint/2 >= nbits)
54     vec.erase(transform(vec.begin(),vec.begin()+vec.size()/2,
55     vec.begin()+vec.size()/2, vec.begin(), bit_or()), vec.end());
56     }
57    
58     ////////////////////////////////////////
59     bool OBFingerprint::GetNextFPrt(std::string& id, OBFingerprint*& pFPrt)
60     {
61     Fptpos iter;
62     if(id.empty())
63     iter=FPtsMap().begin();
64     else
65     {
66     iter=FPtsMap().find(id);
67     if(iter!=FPtsMap().end())
68     ++iter;
69     }
70     if(iter==FPtsMap().end())
71     return false;
72     id = iter->first;
73     pFPrt = iter->second;
74     return true;
75     }
76    
77     OBFingerprint* OBFingerprint::FindFingerprint(string& ID)
78     {
79     if(ID.empty())
80     return _pDefault;
81     Fptpos iter = FPtsMap().find(ID);
82     if(iter==FPtsMap().end())
83     return NULL;
84     else
85     return iter->second;
86     }
87    
88     double OBFingerprint::Tanimoto(const vector<unsigned int>& vec1, const vector<unsigned int>& vec2)
89     {
90     //Independent of sizeof(unsigned int)
91     if(vec1.size()!=vec2.size())
92     return -1; //different number of bits
93     int andbits=0, orbits=0;
94     for (unsigned i=0;i<vec1.size();++i)
95     {
96     int andfp = vec1[i] & vec2[i];
97     int orfp = vec1[i] | vec2[i];
98     //Count bits
99     for(;andfp;andfp=andfp<<1)
100     if(andfp<0) ++andbits;
101     for(;orfp;orfp=orfp<<1)
102     if(orfp<0) ++orbits;
103     }
104     return((double)andbits/(double)orbits);
105     }
106    
107     //*****************************************************************8
108     /*!
109    
110     */
111     bool FastSearch::Find(OBBase* pOb, vector<unsigned int>& SeekPositions,
112     unsigned int MaxCandidates)
113     {
114     ///Finds chemical objects in datafilename (which must previously have been indexed)
115     ///that have all the same bits set in their fingerprints as in the fingerprint of
116     ///a pattern object. (Usually a substructure search in molecules.)
117     ///The type of fingerprint and its degree of folding does not have to be specified
118     ///here because the values in the index file are used.
119     ///The positions of the candidate matching molecules in the original datafile are returned.
120    
121     vector<unsigned int> vecwords;
122     _pFP->GetFingerprint(pOb,vecwords, _index.header.words * OBFingerprint::bitsperint);
123    
124     vector<unsigned int>candidates; //indices of matches from fingerprint screen
125     candidates.reserve(MaxCandidates);
126    
127     unsigned int dataSize = _index.header.nEntries;
128     // GetFingerprint(mol, vecwords, _index.header.words, _index.header.fptype);
129    
130     unsigned int words = _index.header.words;
131     unsigned int* nextp = &_index.fptdata[0];
132     unsigned int* ppat0 = &vecwords[0];
133     register unsigned int* p;
134     register unsigned int* ppat;
135     register unsigned int a;
136     unsigned int i; // need address of this, can't be register
137     for(i=0;i<dataSize; i++) //speed critical section
138     {
139     p=nextp;
140     nextp += words;
141     ppat=ppat0;
142     a=0;
143     while(p<nextp)
144     {
145     if ( (a=((*ppat) & (*p++)) ^ (*ppat++)) ) break;
146     }
147     if(!a)
148     {
149     candidates.push_back(i);
150     if(candidates.size()>=MaxCandidates)
151     break;
152     }
153     }
154    
155     if(i<_index.header.nEntries) //premature end to search
156     {
157     #ifdef HAVE_SSTREAM
158     stringstream errorMsg;
159     #else
160     strstream errorMsg;
161     #endif
162     errorMsg << "Stopped looking after " << i << " molecules." << endl;
163     obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obInfo);
164     }
165    
166     vector<unsigned int>::iterator itr;
167     for(itr=candidates.begin();itr!=candidates.end();itr++)
168     {
169     SeekPositions.push_back(_index.seekdata[*itr]);
170     }
171     return true;
172     }
173    
174     /////////////////////////////////////////////////////////
175     bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned int>& SeekposMap,
176     double MinTani)
177     {
178     vector<unsigned int> targetfp;
179     _pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::bitsperint);
180    
181     unsigned int words = _index.header.words;
182     unsigned int dataSize = _index.header.nEntries;
183     unsigned int* nextp = &_index.fptdata[0];
184     register unsigned int* p;
185     register unsigned int i;
186     for(i=0;i<dataSize; i++) //speed critical section
187     {
188     p=nextp;
189     nextp += words;
190     double tani = OBFingerprint::Tanimoto(targetfp,p);
191     if(tani>MinTani)
192     SeekposMap.insert(pair<const double, unsigned int>(tani,_index.seekdata[i]));
193     }
194     return true;
195     }
196    
197     /////////////////////////////////////////////////////////
198     bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned int>& SeekposMap,
199     int nCandidates)
200     {
201     ///If nCandidates is zero or omitted the original size of the multimap is used
202     if(nCandidates)
203     {
204     //initialise the multimap with nCandidate zero entries
205     SeekposMap.clear();
206     int i;
207     for(i=0;i<nCandidates;++i)
208     SeekposMap.insert(pair<const double, unsigned int>(0,0));
209     }
210     else if(SeekposMap.size()==0)
211     return false;
212    
213     vector<unsigned int> targetfp;
214     _pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::bitsperint);
215    
216     unsigned int words = _index.header.words;
217     unsigned int dataSize = _index.header.nEntries;
218     unsigned int* nextp = &_index.fptdata[0];
219     register unsigned int* p;
220     register unsigned int i;
221     for(i=0;i<dataSize; i++) //speed critical section
222     {
223     p=nextp;
224     nextp += words;
225     double tani = OBFingerprint::Tanimoto(targetfp,p);
226     if(tani>SeekposMap.begin()->first)
227     {
228     SeekposMap.insert(pair<const double, unsigned int>(tani,_index.seekdata[i]));
229     SeekposMap.erase(SeekposMap.begin());
230     }
231     }
232     return true;
233     }/////////////////////////////////////////////////////////
234     string FastSearch::ReadIndex(istream* pIndexstream)
235     {
236     //Reads fs index from istream into member variables
237     // but first checks whether it is already loaded
238     FptIndexHeader headercopy = _index.header;
239     pIndexstream->read((char*)&(_index.header), sizeof(FptIndexHeader));
240    
241     if(memcmp(&headercopy,&(_index.header),sizeof(FptIndexHeader)))
242     {
243     pIndexstream->seekg(_index.header.headerlength);//allows header length to be changed
244    
245     unsigned int nwords = _index.header.nEntries * _index.header.words;
246     _index.fptdata.resize(nwords);
247     _index.seekdata.resize(_index.header.nEntries);
248    
249     pIndexstream->read((char*)&(_index.fptdata[0]), sizeof(unsigned int) * nwords);
250     pIndexstream->read((char*)&(_index.seekdata[0]), sizeof(unsigned int) * _index.header.nEntries);
251    
252     if(pIndexstream->fail())
253     *(_index.header.datafilename) = '\0';
254    
255     string tempFP(_index.header.fpid);
256     _pFP = OBFingerprint::FindFingerprint(tempFP);
257     if(!_pFP)
258     {
259     #ifdef HAVE_SSTREAM
260     stringstream errorMsg;
261     #else
262     strstream errorMsg;
263     #endif
264     errorMsg << "Index has Fingerprints of type '" << _index.header.fpid
265     << " which is not currently loaded." << endl;
266     obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obWarning);
267     *(_index.header.datafilename) = '\0';
268     }
269    
270     }
271     return _index.header.datafilename;
272     }
273    
274     //*******************************************************
275     FastSearchIndexer::FastSearchIndexer(string& datafilename, ostream* os,
276     std::string& fpid, int FptBits)
277     {
278     ///Starts indexing process
279     _indexstream = os;
280     _pFP = OBFingerprint::FindFingerprint(fpid);
281     if(!_pFP)
282     {
283     #ifdef HAVE_SSTREAM
284     stringstream errorMsg;
285     #else
286     strstream errorMsg;
287     #endif
288     errorMsg << "Fingerprint type '" << fpid << "' not available" << endl;
289     obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obWarning);
290     }
291    
292     _nbits=FptBits;
293     _pindex= new FptIndex;
294     _pindex->header.headerlength = sizeof(FptIndexHeader);
295     strncpy(_pindex->header.fpid,fpid.c_str(),15);
296     strncpy(_pindex->header.datafilename, datafilename.c_str(), 255);
297     }
298    
299     /////////////////////////////////////////////////////////////
300     FastSearchIndexer::~FastSearchIndexer()
301     {
302     ///Saves index file
303     _pindex->header.nEntries = _pindex->seekdata.size();
304     _indexstream->write((const char*)&_pindex->header, sizeof(FptIndexHeader));
305     _indexstream->write((const char*)&_pindex->fptdata[0], _pindex->fptdata.size()*sizeof(unsigned int));
306     _indexstream->write((const char*)&_pindex->seekdata[0], _pindex->seekdata.size()*sizeof(unsigned int));
307     if(!_indexstream)
308     obErrorLog.ThrowError(__FUNCTION__,
309     "Difficulty writing index", obWarning);
310     delete _pindex;
311     }
312    
313     ///////////////////////////////////////////////////////////////
314     bool FastSearchIndexer::Add(OBBase* pOb, streampos seekpos)
315     {
316     ///Adds a fingerprint
317    
318     vector<unsigned int> vecwords;
319     if(!_pFP)
320     return false;
321     if(_pFP->GetFingerprint(pOb, vecwords, _nbits))
322     {
323     _pindex->header.words = vecwords.size(); //Use size as returned from fingerprint
324     for(unsigned int i=0;i<_pindex->header.words;i++)
325     _pindex->fptdata.push_back(vecwords[i]);
326     _pindex->seekdata.push_back(seekpos);
327     return true;
328     }
329     obErrorLog.ThrowError(__FUNCTION__, "Failed to make a fingerprint", obWarning);
330     return false;
331    
332     }
333    
334     /*!
335     \class OBFingerprint
336     These fingerprints are condensed representation of molecules (or other objects)
337     as a list of boolean values (actually bits in a vector<unsigned>) with length
338     of a power of 2. The main motivation is for fast searching of data sources
339     containing large numbers of molecules (up to several million). OpenBabel
340     provides some routines which can search text files containing lists of molecules
341     in any format. See the documentation on the class FastSearch.
342    
343     There are descriptions of molecular fingerprints at <br>
344     http://www.daylight.com/dayhtml/doc/theory/theory.finger.html) and <br>
345     http://www.mesaac.com/Fingerprint.htm <br>
346     Many methods of preparing fingerprints have been described, but the type supported
347     currently in OpenBabel has each bit representing a substructure (or other
348     molecular property). If a substructure is present in the molecule, then a
349     particular bit is set to 1. But because the hashing method may also map other
350     substructures to the same bit, a match does not guarantee that a particular
351     substructure is present; there may be false positives. However, with proper design,
352     a large fraction of irrelevant molecules in a data set can be eliminated in a
353     fast search with boolean methods on the fingerprints.
354     It then becomes feasible to make a definitive substructure search by
355     conventional methods on this reduced list even if it is slow.
356    
357     OpenBabel provides a framework for applying new types of fingerprints without
358     changing any existing code. They are derived from OBFingerprint and the
359     source file is just compiled with the rest of OpenBabel. Alternatively,
360     they can be separately compiled as a DLL or shared library and discovered
361     when OpenBabel runs.
362    
363     Fingerprints derived from this abstract base class OBFingerprint can be for any
364     object derived from OBBase (not just for OBMol).
365     Each derived class provides an ID as a string and OBFingerprint keeps a map of
366     these to provides a pointer to the class when requested in FindFingerprint.
367    
368     <h4>-- To define a fingerprint type --</h4>
369     The classes derived form OBFingerprint are required to provide
370     a GetFingerprint() routine and a Description() routine
371     \code
372     class MyFpType : OBFingerprint
373     {
374     MyFpType(const char* id) : OBFingerprint(id){};
375     virtual bool GetFingerprint(OBBase* pOb, vector<unsigned int>& fp, int nbits)
376     {
377     //Convert pOb to the required type, usually OBMol
378     OBMol* pmol = dynamic_cast<OBMol*>(pOb);
379     fp.resize(required_number_of_words);
380     ...
381     use SetBit(fp,n); to set the nth bit
382    
383     if(nbits)
384     Fold(fp, nbits);
385     }
386     virtual const char* Description(){ return "Some descriptive text";}
387     ...
388     };
389     \endcode
390     //Declare a global instance with the ID you will use in -f options to specify its use.
391     \code
392     MyFpType theMyFpType("myfpID");
393     \endcode
394    
395     <h4>-- To obtain a fingerprint --</h4>
396     \code
397     OBMol mol;
398     ...
399     vector<unsigned int> fp;
400     OBFingerprint::GetDefault()->GetFingerprint(&mol, fp); //gets default size of fingerprint
401     \endcode
402     or
403     \code
404     vector<unsigned int> fp;
405     OBFingerPrint* pFP = OBFingerprint::FindFingerprint("myfpID");
406     ...and maybe...
407     pFP->GetFingerprint(&mol,fp, 128); //fold down to 128bits if was originally larger
408     \endcode
409    
410     <h4>-- To print a list of available fingerprint types --</h4>
411     \code
412     std::string id;
413     OBFingerPrint* pFPrt=NULL;
414     while(OBFingerprint::GetNextFPrt(id, pFPrt))
415     {
416     cout << id << " -- " << pFPrt->Description() << endl;
417     }
418     \endcode
419    
420    
421     Fingerprints are handled as vector<unsigned int> so that the number of bits
422     in this vector and their order will be platform and compiler
423     dependent, because of size of int and endian differences. Use fingerprints
424     (and fastsearch indexes containing them) only for comparing with other
425     fingerprints prepared on the same machine.
426    
427     The FingerprintFormat class is an output format which displays fingerprints
428     as hexadecimal. When multiple molecules are supplied it will calculate the
429     Tanimoto coefficient from the first molecule to each of the others. It also
430     shows whether the first molecule is a possible substructure to all the others,
431     i.e. whether all the bits set in the fingerprint for the first molecule are
432     set in the fingerprint of the others. To display hexadecimal information when
433     multiple molecules are provided it is necessay to use the -xh option.
434    
435     To see a list of available format types, type babel -F on the command line.
436     The -xF option of the FingerprintFormat class also provides this output, but due
437     to a quirk in the way the program works, it is necessary to have a valid input
438     molecule for this option to work.
439     */
440    
441     /*! \class FastSearch
442     The FastSearch class searches an index to a datafile containing a list of molecules
443     (or other objects) prepared by FastSearchIndexer.
444    
445     OpenBabel can also search files for molecules containing a substructure specified
446     by a SMARTS string, using OBSmartsPattern or from the command line:
447     \code
448     babel datafile.xxx -outfile.yyy -sSMARTS
449     \endcode
450     But when the data file contains more than about 10,000 molecules this becomes
451     rather too slow to be used interactively. To do it more quickly, an index
452     of the molecules containing their fingerprints (see OBFingerprint) is prepared using
453     FastSearchIndexer. The indexing process may take a long time but only has to
454     be done once. The index can be searched very quickly with FastSearch. Because
455     of the nature of fingerprints a match to a bit does not guarantee
456     the presence of a particular substructure or other molecular property, so that
457     a definitive answer may require a subsequent search of the (much reduced) list
458     of candidates by another method (like OBSmartsPattern).
459    
460     Note that the index files are not portable. They need to be prepared on the
461     computer that will acess them.
462    
463     <h4>Using FastSearch and FastSearchIndexer in a program</h4>
464    
465     The index has two tables:
466     - an array of fingerprints of the molecules
467     - an array of the seek positions in the datasource file of all the molecules
468    
469     <h4>To prepare an fastsearch index file:</h4>
470     - Open an ostream to the index file.
471     - Make a FastSearchIndexer object on the heap or the stack, passing in as parameters:
472     - the datafile name, the indexfile stream,
473     - the id of the fingerprint type to be used,
474     - the number of bits the fingerprint is to be folded down to, If it is to be left
475     unfolded, set fpid to 0 or do not specify it.
476     .
477     - For each molecule, call Add() with its pointer and its position in the datafile.<br>
478     Currently the std::streampos value is implicitly cast to unsigned int so that
479     for 32bit machines the datafile has to be no longer than about 2Gbyte.
480     - The index file is written when the FastSearchIndexer object is deleted or goes out
481     of scope.
482    
483     <h4>To search in a fastsearch index file</h4>
484    
485     - Open a std::istream to the indexfile (in binary mode on some systems)
486     - Make a FastSearch object, read the index and open the datafile whose
487     name it provides
488     \code
489     ifstream ifs(indexname,ios::binary);
490     FastSearch fs;
491     string datafilename = fs.ReadIndex(&ifs);
492     if(datafilename.empty()/endcode
493     return false;
494     ifstream datastream(datafilename);
495     if(!datastream)
496     return false;
497     \endcode
498    
499     <strong>To do a search for molecules which have all the substructure bits the
500     OBMol object, patternMol</strong>
501     \code
502     vector<unsigned int>& SeekPositions;
503     if(!fs.Find(patternMol, SeekPositions, MaxCandidates))
504    
505     for(itr=SeekPositions.begin();itr!=SeekPositions.end();itr++)
506     {
507     datastream.seekg(*itr);
508     ... read the candidate molecule
509     and subject to more rigorous test if necessary
510     }
511     \endcode
512    
513     <strong>To do a similarity search based on the Tanimoto coefficient</strong>
514     This is defined as: <br>
515     <em>Number of bits set in (patternFP & targetFP)/Number of bits in (patternFP | targetFP)</em><br>
516     where the boolean operations between the fingerprints are bitwise<br>
517     The Tanimoto coefficient has no absolute meaning and depends on
518     the design of the fingerprint.
519     \code
520     multimap<double, unsigned int> SeekposMap;
521     // to find n best molecules
522     fs.FindSimilar(&patternMol, SeekposMap, n);
523     ...or
524     // to find molecules with Tanimoto coefficient > MinTani
525     fs.FindSimilar(&patternMol, SeekposMap, MinTani);
526    
527     multimap<double, unsigned int>::reverse_iterator itr;
528     for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr)
529     {
530     datastream.seekg(itr->second);
531     ... read the candidate molecule
532     double tani = itr->first;
533     }
534     \endcode
535    
536     The FastSearchFormat class facilitates the use of these routine from the
537     command line or other front end program. For instance:
538    
539     <strong>Prepare an index:</strong>
540     \code
541     babel datafile.xxx index.fs
542     \endcode
543     or
544     \code
545     babel datafile.xxx -ofs namedindex
546     \endcode
547     With options you can specify:
548     - which type of fingerprint to be used, e.g. -xfFP2,
549     - whether it is folded to a specified number of bits, e.g. -xn128
550     (which should be a power of2)
551     - whether to pre-select the molecules which are indexed:
552     - by structure e.g only ethers and esters, -sCOC
553     - by excluding molecules with bezene rings, -vc1ccccc1
554     - by position in the datafile e.g. only mols 10 to 90, -f10 -l90
555     .
556     <strong>Substructure search</strong> in a previously prepared index file
557     \code
558     babel index.fs outfile.yyy -sSMILES
559     \endcode
560     The datafile can also be used as the input file, provided the input format is specified as fs
561     \code
562     babel datafile.xxx outfile.yyy -sSMILES -ifs
563     \endcode
564     A "slow" search not using fingerprints would be done on the same data by omitting -ifs.
565     A "slow" search can use SMARTS strings, but the fast search is restricted
566     to the subset which is valid SMILES.
567    
568     With the -S option, the target can be specified as a molecule in a file of any format
569     \code
570     babel datafile.xxx outfile.yyy -Spattern_mol.zzz -ifs
571     \endcode
572     These searches have two stages: a fingerprint search which produces
573     a number of candidate molecules and a definitive search which selects
574     from these using SMARTS pattern matching. The maximum number of candidates
575     is 4000 by default but you can change this with an option
576     e.g. -al 8000 (Note that you need the space between l and the number.)
577     If the fingerprint search reaches the maximum number it will not
578     look further and will tell you at what stage it stopped.
579    
580     <strong>Similarity search</strong> in a previously prepared index file<br>
581     This rather done (rather than a substructure search) if the -at option is used,
582     \code
583     babel datafile.xxx outfile.yyy -sSMILES -ifs -at0.7
584     \endcode
585     for instance
586     - -at0.7 will recover all molecules with Tanimoto greater than 0.7
587     - -at15 (no decimal point) will recover the 15 molecules with largest coefficients.
588     - -aa will add the Tanimoto coefficient to the titles of the output molecules.
589    
590     All stages, the indexing, the interpretation of the SMILES string in the -s option,
591     the file in the -S option and the final SMARTS filter convert to OBMol and apply
592     ConvertDativeBonds(). This ensures thatforms such as[N+]([O-])=O and N(=O)=O
593     are recognized as chemically identical.
594     */
595    
596     }//Openbabel
597    
598     //! \file fingerprint.cpp
599     //! \brief Definitions for OBFingerprint base class and fastsearch classes