ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/group/trunk/OOPSE-4/src/openbabel/pdbformat.cpp
Revision: 2440
Committed: Wed Nov 16 19:42:11 2005 UTC (18 years, 8 months ago) by tim
File size: 21294 byte(s)
Log Message:
adding openbabel

File Contents

# User Rev Content
1 tim 2440 /**********************************************************************
2     Copyright (C) 1998-2001 by OpenEye Scientific Software, Inc.
3     Some portions Copyright (C) 2003-2005 Geoffrey R. Hutchison
4     Some portions Copyright (C) 2004 by Chris Morley
5    
6     This program is free software; you can redistribute it and/or modify
7     it under the terms of the GNU General Public License as published by
8     the Free Software Foundation version 2 of the License.
9    
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13     GNU General Public License for more details.
14     ***********************************************************************/
15    
16     #include "babelconfig.hpp"
17     #include "mol.hpp"
18     #include "obconversion.hpp"
19     #include "obmolecformat.hpp"
20    
21     #if !HAVE_SNPRINTF
22     extern "C" int snprintf( char *, size_t, const char *, /* args */ ...);
23     #endif
24    
25     #include <vector>
26     #include <map>
27    
28     #ifdef HAVE_SSTREAM
29     #include <sstream>
30     #else
31     #include <strstream>
32     #endif
33    
34     using namespace std;
35     namespace OpenBabel
36     {
37    
38     class PDBFormat : public OBMoleculeFormat
39     {
40     public:
41     //Register this format type ID
42     PDBFormat()
43     {
44     OBConversion::RegisterFormat("pdb",this, "chemical/x-pdb");
45     OBConversion::RegisterFormat("ent",this, "chemical/x-pdb");
46     }
47    
48     virtual const char* Description() //required
49     {
50     return
51     "Protein Data Bank format\n \
52     Read Options e.g. -as\n\
53     s Output single bonds only\n\
54     b Disable bonding entirely\n\n";
55     };
56    
57     virtual const char* SpecificationURL()
58     { return "http://www.rcsb.org/pdb/docs/format/pdbguide2.2/guide2.2_frame.html";};
59    
60     virtual const char* GetMIMEType()
61     { return "chemical/x-pdb"; };
62    
63     //Flags() can return be any the following combined by | or be omitted if none apply
64     // NOTREADABLE READONEONLY NOTWRITABLE WRITEONEONLY
65     virtual unsigned int Flags()
66     {
67     return READONEONLY;
68     };
69    
70     //*** This section identical for most OBMol conversions ***
71     ////////////////////////////////////////////////////
72     /// The "API" interface functions
73     virtual bool ReadMolecule(OBBase* pOb, OBConversion* pConv);
74     virtual bool WriteMolecule(OBBase* pOb, OBConversion* pConv);
75    
76     };
77     //***
78    
79     //Make an instance of the format class
80     PDBFormat thePDBFormat;
81    
82     /////////////////////////////////////////////////////////////////
83    
84    
85     static bool ParseAtomRecord(char *, OBMol &,int);
86     static bool ParseConectRecord(char *,OBMol &);
87    
88     //extern OBResidueData resdat; now in mol.h
89    
90     /////////////////////////////////////////////////////////////////
91     bool PDBFormat::ReadMolecule(OBBase* pOb, OBConversion* pConv)
92     {
93    
94     OBMol* pmol = dynamic_cast<OBMol*>(pOb);
95     if(pmol==NULL)
96     return false;
97    
98     //Define some references so we can use the old parameter names
99     istream &ifs = *pConv->GetInStream();
100     OBMol &mol = *pmol;
101     const char* title = pConv->GetTitle();
102    
103     int chainNum = 1;
104     char buffer[BUFF_SIZE];
105     OBBitVec bs;
106    
107     mol.SetTitle(title);
108    
109     mol.BeginModify();
110     while (ifs.getline(buffer,BUFF_SIZE) && !EQn(buffer,"END",3))
111     {
112     if (EQn(buffer,"TER",3))
113     chainNum++;
114     if (EQn(buffer,"ATOM",4) || EQn(buffer,"HETATM",6))
115     {
116     ParseAtomRecord(buffer,mol,chainNum);
117     if (EQn(buffer,"ATOM",4))
118     bs.SetBitOn(mol.NumAtoms());
119     }
120    
121     if (EQn(buffer,"CONECT",6))
122     ParseConectRecord(buffer,mol);
123     }
124    
125     resdat.AssignBonds(mol,bs);
126     /*assign hetatm bonds based on distance*/
127    
128     if (!pConv->IsOption("b",OBConversion::INOPTIONS))
129     mol.ConnectTheDots();
130    
131     if (mol.NumAtoms() < 250) // Minimize time required on real proteins
132     if (!pConv->IsOption("s",OBConversion::INOPTIONS) && !pConv->IsOption("b",OBConversion::INOPTIONS))
133     mol.PerceiveBondOrders();
134    
135     // clean out remaining blank lines
136     while(ifs.peek() != EOF && ifs.good() &&
137     (ifs.peek() == '\n' || ifs.peek() == '\r'))
138     ifs.getline(buffer,BUFF_SIZE);
139    
140     mol.EndModify();
141    
142     mol.SetAtomTypesPerceived();
143     atomtyper.AssignImplicitValence(mol);
144    
145     if (!mol.NumAtoms())
146     return(false);
147     return(true);
148     }
149    
150     ////////////////////////////////////////////////////////////////
151     static bool ParseAtomRecord(char *buffer, OBMol &mol,int chainNum)
152     /* ATOMFORMAT "(i5,1x,a4,a1,a3,1x,a1,i4,a1,3x,3f8.3,2f6.2,1x,i3)" */
153     {
154     string sbuf = &buffer[6];
155     if (sbuf.size() < 48)
156     return(false);
157    
158     bool hetatm = (EQn(buffer,"HETATM",6)) ? true : false;
159    
160     /* serial number */
161     string serno = sbuf.substr(0,5);
162     //SerialNum(the_atom) = atoi(tmp_str);
163    
164     /* atom name */
165     string atmid = sbuf.substr(6,4);
166    
167     /* element */
168     string element;
169     if (sbuf.size() > 71)
170     element = sbuf.substr(70,2);
171     else
172     element = " ";
173    
174     //trim spaces on the right and left sides
175     while (!atmid.empty() && atmid[0] == ' ')
176     atmid = atmid.substr(1,atmid.size()-1);
177    
178     while (!atmid.empty() && atmid[atmid.size()-1] == ' ')
179     atmid = atmid.substr(0,atmid.size()-1);
180    
181     /* residue name */
182    
183     string resname = sbuf.substr(11,3);
184     if (resname == " ")
185     resname = "UNK";
186     else
187     {
188     while (!resname.empty() && resname[0] == ' ')
189     resname = resname.substr(1,resname.size()-1);
190    
191     while (!resname.empty() && resname[resname.size()-1] == ' ')
192     resname = resname.substr(0,resname.size()-1);
193     }
194    
195     /* residue sequence number */
196    
197     string resnum = sbuf.substr(16,4);
198    
199     /* X, Y, Z */
200     string xstr = sbuf.substr(24,8);
201     string ystr = sbuf.substr(32,8);
202     string zstr = sbuf.substr(40,8);
203    
204     string type;
205    
206     if (EQn(buffer,"ATOM",4))
207     {
208     type = atmid.substr(0,2);
209     if (isdigit(type[0]))
210     type = atmid.substr(1,1);
211     else if (sbuf[6] == ' ' &&
212     strncasecmp(type.c_str(), "Zn", 2) != 0 &&
213     strncasecmp(type.c_str(), "Fe", 2) != 0)
214     type = atmid.substr(0,1); // one-character element
215    
216    
217     if (resname.substr(0,2) == "AS" || resname[0] == 'N')
218     {
219     if (atmid == "AD1")
220     type = "O";
221     if (atmid == "AD2")
222     type = "N";
223     }
224     if (resname.substr(0,3) == "HIS" || resname[0] == 'H')
225     {
226     if (atmid == "AD1" || atmid == "AE2")
227     type = "N";
228     if (atmid == "AE1" || atmid == "AD2")
229     type = "C";
230     }
231     if (resname.substr(0,2) == "GL" || resname[0] == 'Q')
232     {
233     if (atmid == "AE1")
234     type = "O";
235     if (atmid == "AE2")
236     type = "N";
237     }
238     }
239     else //must be hetatm record
240     {
241     if (isalpha(element[1]) && (isalpha(element[0]) || (element[0] == ' ')))
242     {
243     if (isalpha(element[0]))
244     type = element.substr(0,2);
245     else
246     type = element.substr(1,1);
247     if (type.size() == 2)
248     type[1] = tolower(type[1]);
249     }
250     else
251     {
252     if (isalpha(atmid[0]))
253     type = atmid.substr(0,2);
254     else if (atmid[0] == ' ')
255     type = atmid.substr(1,1); // one char element
256     else
257     type = atmid.substr(1,2);
258    
259     if (atmid == resname)
260     {
261     type = atmid;
262     if (type.size() == 2)
263     type[1] = tolower(type[1]);
264     }
265     else
266     if (resname == "ADR" || resname == "COA" || resname == "FAD" ||
267     resname == "GPG" || resname == "NAD" || resname == "NAL" ||
268     resname == "NDP")
269     {
270     if (type.size() > 1)
271     type = type.substr(0,1);
272     //type.erase(1,type.size()-1);
273     }
274     else
275     if (isdigit(type[0]))
276     {
277     type = type.substr(1,1);
278     //type.erase(0,1);
279     //if (type.size() > 1) type.erase(1,type.size()-1);
280     }
281     else
282     if (type.size() > 1 && isdigit(type[1]))
283     type = type.substr(0,1);
284     //type.erase(1,1);
285     else
286     if (type.size() > 1 && isalpha(type[1]) && isupper(type[1]))
287     type[1] = tolower(type[1]);
288     }
289    
290     }
291    
292     OBAtom atom;
293     vector3 v(atof(xstr.c_str()),atof(ystr.c_str()),atof(zstr.c_str()));
294     atom.SetVector(v);
295    
296     atom.SetAtomicNum(etab.GetAtomicNum(type.c_str()));
297     atom.SetType(type);
298    
299     int rnum = atoi(resnum.c_str());
300     OBResidue *res = (mol.NumResidues() > 0) ? mol.GetResidue(mol.NumResidues()-1) : NULL;
301     if (res == NULL || res->GetName() != resname || static_cast<int>(res->GetNum())
302     != rnum)
303     {
304     vector<OBResidue*>::iterator ri;
305     for (res = mol.BeginResidue(ri) ; res ; res = mol.NextResidue(ri))
306     if (res->GetName() == resname && static_cast<int>(res->GetNum())
307     == rnum)
308     break;
309    
310     if (res == NULL)
311     {
312     res = mol.NewResidue()
313     ;
314     res->SetChainNum(chainNum);
315     res->SetName(resname);
316     res->SetNum(rnum);
317     }
318     }
319    
320     if (!mol.AddAtom(atom)
321     )
322     return(false);
323     else
324     {
325     OBAtom *atom = mol.GetAtom(mol.NumAtoms());
326    
327     res->AddAtom(atom);
328     res->SetSerialNum(atom, atoi(serno.c_str()));
329     res->SetAtomID(atom, atmid);
330     res->SetHetAtom(atom, hetatm);
331    
332     return(true);
333     }
334     }
335    
336     /////////////////////////////////////////////////////////////////////////
337     //! Utility function to read a 5-digit integer starting from a specified column
338     /*! This function reads a 5-digit integer, starting from column
339     columnAsSpecifiedInPDB from the buffer, converts it to a long
340     integer, and returns either false or true, if the conversion was
341     successful or not. If the conversion was not successful, the target
342     is set to a random value.
343    
344     For instance, the PDB Format Description for a CONECT record specifies
345    
346     COLUMNS DATA TYPE FIELD DEFINITION
347     ---------------------------------------------------------------------------------
348     1 - 6 Record name "CONECT"
349     7 - 11 Integer serial Atom serial number
350     ...
351    
352     To read the Atom serial number, you would call
353    
354     long int target;
355     if ( readIntegerFromRecord(buffer, 7, &target) == false ) {
356     cerr << "Could not parse" << endl;
357     }
358    
359     This function does not check the length of the buffer, or
360     strlen(buffer). If the buffer is not long enough => SEGFAULT.
361     */
362     static bool readIntegerFromRecord(char *buffer, unsigned int columnAsSpecifiedInPDB, long int *target)
363     {
364     char integerBuffer[6];
365     integerBuffer[5] = 0;
366    
367     strncpy(integerBuffer, buffer+columnAsSpecifiedInPDB-1, 5);
368    
369     char *errorCheckingEndPtr;
370     *target = strtol(integerBuffer, &errorCheckingEndPtr, 10);
371     if (integerBuffer == errorCheckingEndPtr)
372     return(false);
373     return(true);
374     }
375    
376     //! Read a CONECT record
377     /*! This function reads a CONECT record, as specified
378     http://www.rcsb.org/pdb/docs/format/pdbguide2.2/guide2.2_frame.html,
379     in short:
380    
381     COLUMNS DATA TYPE FIELD DEFINITION
382     ---------------------------------------------------------------------------------
383     1 - 6 Record name "CONECT"
384     7 - 11 Integer serial Atom serial number
385     12 - 16 Integer serial Serial number of bonded atom
386     17 - 21 Integer serial Serial number of bonded atom
387     22 - 26 Integer serial Serial number of bonded atom
388     27 - 31 Integer serial Serial number of bonded atom
389     32 - 36 Integer serial Serial number of hydrogen bonded atom
390     37 - 41 Integer serial Serial number of hydrogen bonded atom
391     42 - 46 Integer serial Serial number of salt bridged atom
392     47 - 51 Integer serial Serial number of hydrogen bonded atom
393     52 - 56 Integer serial Serial number of hydrogen bonded atom
394     57 - 61 Integer serial Serial number of salt bridged atom
395    
396     Hydrogen bonds and salt bridges are ignored. --Stefan Kebekus.
397     */
398    
399     static bool ParseConectRecord(char *buffer,OBMol &mol)
400     {
401     #ifdef HAVE_SSTREAM
402     stringstream errorMsg;
403     #else
404     strstream errorMsg;
405     #endif
406    
407     // Setup strings and string buffers
408     buffer[70] = '\0';
409     if (strlen(buffer) < 70)
410     {
411     errorMsg << "WARNING: Problems reading a PDB file\n"
412     << " Problems reading a CONECT record.\n"
413     << " According to the PDB specification,\n"
414     << " the record should have 70 columns, but OpenBabel found "
415     << strlen(buffer) << " columns.";
416     obErrorLog.ThrowError(__FUNCTION__, errorMsg.str() , obInfo);
417     }
418    
419     // Serial number of the first atom, read from column 7-11 of the
420     // connect record, to which the other atoms connect to.
421     long int startAtomSerialNumber;
422     if (readIntegerFromRecord(buffer, 7, &startAtomSerialNumber) == false)
423     {
424     errorMsg << "WARNING: Problems reading a PDB file\n"
425     << " Problems reading a CONECT record.\n"
426     << " According to the PDB specification,\n"
427     << " columns 7-11 should contain the serial number of an atom.\n"
428     << " THIS CONECT RECORD WILL BE IGNORED.";
429     obErrorLog.ThrowError(__FUNCTION__, errorMsg.str() , obWarning);
430     return(false);
431     }
432    
433     // Find a pointer to the first atom.
434     OBAtom *firstAtom = 0L;
435     vector<OBNodeBase*>::iterator i;
436     for (OBAtom *a1 = mol.BeginAtom(i);a1;a1 = mol.NextAtom(i))
437     if (static_cast<long int>(a1->GetResidue()->
438     GetSerialNum(a1)) == startAtomSerialNumber)
439     {
440     firstAtom = a1;
441     break;
442     }
443     if (firstAtom == 0L)
444     {
445     errorMsg << "WARNING: Problems reading a PDB file:\n"
446     << " Problems reading a CONECT record.\n"
447     << " According to the PDB specification,\n"
448     << " columns 7-11 should contain the serial number of an atom.\n"
449     << " No atom was found with this serial number.\n"
450     << " THIS CONECT RECORD WILL BE IGNORED.";
451     obErrorLog.ThrowError(__FUNCTION__, errorMsg.str() , obWarning);
452     return(false);
453     }
454    
455     // Serial numbers of the atoms which bind to firstAtom, read from
456     // columns 12-16, 17-21, 22-27 and 27-31 of the connect record. Note
457     // that we reserve space for 5 integers, but read only four of
458     // them. This is to simplify the determination of the bond order;
459     // see below.
460     long int boundedAtomsSerialNumbers[5] = {0,0,0,0,0};
461     // Bools which tell us which of the serial numbers in
462     // boundedAtomsSerialNumbers are read from the file, and which are
463     // invalid
464     bool boundedAtomsSerialNumbersValid[5] = {false, false, false, false, false};
465    
466     // Now read the serial numbers. If the first serial number is not
467     // present, this connect record probably contains only hydrogen
468     // bonds and salt bridges, which we ignore. In that case, we just
469     // exit gracefully.
470     boundedAtomsSerialNumbersValid[0] = readIntegerFromRecord(buffer, 12, boundedAtomsSerialNumbers+0);
471     if (boundedAtomsSerialNumbersValid[0] == false)
472     return(true);
473     boundedAtomsSerialNumbersValid[1] = readIntegerFromRecord(buffer, 17, boundedAtomsSerialNumbers+1);
474     boundedAtomsSerialNumbersValid[2] = readIntegerFromRecord(buffer, 22, boundedAtomsSerialNumbers+2);
475     boundedAtomsSerialNumbersValid[3] = readIntegerFromRecord(buffer, 27, boundedAtomsSerialNumbers+3);
476    
477     // Now iterate over the VALID boundedAtomsSerialNumbers and connect
478     // the atoms.
479     for(unsigned int k=0; boundedAtomsSerialNumbersValid[k]; k++)
480     {
481     // Find atom that is connected to, write an error message
482     OBAtom *connectedAtom = 0L;
483     for (OBAtom *a1 = mol.BeginAtom(i);a1;a1 = mol.NextAtom(i))
484     if (static_cast<long int>(a1->GetResidue()->
485     GetSerialNum(a1)) == boundedAtomsSerialNumbers[k])
486     {
487     connectedAtom = a1;
488     break;
489     }
490     if (connectedAtom == 0L)
491     {
492     errorMsg << "WARNING: Problems reading a PDB file:\n"
493     << " Problems reading a CONECT record.\n"
494     << " According to the PDB specification,\n"
495     << " Atoms with serial #" << startAtomSerialNumber
496     << " and #" << boundedAtomsSerialNumbers[k]
497     << " should be connected\n"
498     << " However, an atom with serial #" << boundedAtomsSerialNumbers[k] << " was not found.\n"
499     << " THIS CONECT RECORD WILL BE IGNORED.";
500     obErrorLog.ThrowError(__FUNCTION__, errorMsg.str() , obWarning);
501     break;
502     }
503    
504     // Figure the bond order
505     unsigned char order = 0;
506     while(boundedAtomsSerialNumbersValid[k+order+1] && (boundedAtomsSerialNumbers[k+order]
507     == boundedAtomsSerialNumbers[k+order+1]))
508     order++;
509     k += order;
510    
511     // Generate the bond
512     mol.AddBond(firstAtom->GetIdx(), connectedAtom->GetIdx(), order+1);
513     }
514     return(true);
515     }
516    
517     //////////////////////////////////////////////////////////////////////////////
518     bool PDBFormat::WriteMolecule(OBBase* pOb, OBConversion* pConv)
519     {
520     OBMol* pmol = dynamic_cast<OBMol*>(pOb);
521     if(pmol==NULL)
522     return false;
523    
524     //Define some references so we can use the old parameter names
525     ostream &ofs = *pConv->GetOutStream();
526     OBMol &mol = *pmol;
527    
528     unsigned int i;
529     char buffer[BUFF_SIZE];
530     char type_name[10], padded_name[10];
531     char the_res[10];
532     char *element_name;
533     int res_num;
534     bool het=true;
535    
536     // sprintf(buffer,"HEADER PROTEIN");
537     // ofs << buffer << endl;
538    
539     if (strlen(mol.GetTitle()) > 0)
540     sprintf(buffer,"COMPND %s ",mol.GetTitle());
541     else
542     sprintf(buffer,"COMPND UNNAMED");
543     ofs << buffer << endl;
544    
545     sprintf(buffer,"AUTHOR GENERATED BY OPEN BABEL %s",BABEL_VERSION);
546     ofs << buffer << endl;
547    
548     OBAtom *atom;
549     OBResidue *res;
550     for (i = 1; i <= mol.NumAtoms(); i++)
551     {
552     atom = mol.GetAtom(i);
553     strcpy(type_name,etab.GetSymbol(atom->GetAtomicNum()));
554    
555     //two char. elements are on position 13 and 14 one char. start at 14
556     if (strlen(type_name) > 1)
557     type_name[1] = toupper(type_name[1]);
558     else
559     {
560     char tmp[10];
561     strcpy(tmp, type_name);
562     sprintf(type_name, " %-3s", tmp);
563     }
564    
565     if ( (res = atom->GetResidue()) )
566     {
567     het = res->IsHetAtom(atom);
568     snprintf(the_res,4,"%s",(char*)res->GetName().c_str());
569     snprintf(type_name,5,"%s",(char*)res->GetAtomID(atom).c_str());
570    
571     //two char. elements are on position 13 and 14 one char. start at 14
572     if (strlen(etab.GetSymbol(atom->GetAtomicNum())) == 1)
573     {
574     if (strlen(type_name) < 4)
575     {
576     char tmp[10];
577     strcpy(tmp, type_name);
578     sprintf(padded_name," %-3s", tmp);
579     strncpy(type_name,padded_name,4);
580     type_name[4] = '\0';
581     }
582     else
583     {
584     type_name[4] = type_name[3];
585     type_name[3] = type_name[2];
586     type_name[2] = type_name[1];
587     type_name[1] = type_name[0];
588     type_name[0] = type_name[4];
589     type_name[4] = '\0';
590     }
591     }
592     res_num = res->GetNum();
593     }
594     else
595     {
596     strcpy(the_res,"UNK");
597     sprintf(padded_name,"%s",type_name);
598     strncpy(type_name,padded_name,4);
599     type_name[4] = '\0';
600     res_num = 1;
601     }
602    
603     element_name = etab.GetSymbol(atom->GetAtomicNum());
604     if (strlen(element_name) == 2)
605     element_name[1] = toupper(element_name[1]);
606     sprintf(buffer,"%s%5d %-4s %-3s %4d %8.3f%8.3f%8.3f 1.00 0.00 %2s \n",
607     het?"HETATM":"ATOM ",
608     i,
609     type_name,
610     the_res,
611     res_num,
612     atom->GetX(),
613     atom->GetY(),
614     atom->GetZ(),
615     element_name);
616     ofs << buffer;
617     }
618    
619     OBAtom *nbr;
620     int count;
621     vector<OBEdgeBase*>::iterator k;
622     for (i = 1; i <= mol.NumAtoms(); i ++)
623     {
624     atom = mol.GetAtom(i);
625     if (atom->GetValence() <= 4)
626     {
627     sprintf(buffer,"CONECT%5d", i);
628     ofs << buffer;
629     for (nbr = atom->BeginNbrAtom(k);nbr;nbr = atom->NextNbrAtom(k))
630     {
631     sprintf(buffer,"%5d", nbr->GetIdx());
632     ofs << buffer;
633     }
634     for (count = 0; count < (4 - (int)atom->GetValence()); count++)
635     {
636     sprintf(buffer, " ");
637     ofs << buffer;
638     }
639     ofs << " " << endl;
640     }
641     }
642     sprintf(buffer,"MASTER 0 0 0 0 0 0 0 0 ");
643     ofs << buffer;
644     sprintf(buffer,"%4d 0 %4d 0",mol.NumAtoms(),mol.NumAtoms());
645     ofs << buffer << endl;
646     sprintf(buffer,"END");
647     ofs << buffer << endl;
648     return(true);
649     }
650    
651    
652     } //namespace OpenBabel