ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/group/trunk/OOPSE-4/src/openbabel/pdbformat.cpp
Revision: 2440
Committed: Wed Nov 16 19:42:11 2005 UTC (18 years, 8 months ago) by tim
File size: 21294 byte(s)
Log Message:
adding openbabel

File Contents

# Content
1 /**********************************************************************
2 Copyright (C) 1998-2001 by OpenEye Scientific Software, Inc.
3 Some portions Copyright (C) 2003-2005 Geoffrey R. Hutchison
4 Some portions Copyright (C) 2004 by Chris Morley
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation version 2 of the License.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14 ***********************************************************************/
15
16 #include "babelconfig.hpp"
17 #include "mol.hpp"
18 #include "obconversion.hpp"
19 #include "obmolecformat.hpp"
20
21 #if !HAVE_SNPRINTF
22 extern "C" int snprintf( char *, size_t, const char *, /* args */ ...);
23 #endif
24
25 #include <vector>
26 #include <map>
27
28 #ifdef HAVE_SSTREAM
29 #include <sstream>
30 #else
31 #include <strstream>
32 #endif
33
34 using namespace std;
35 namespace OpenBabel
36 {
37
38 class PDBFormat : public OBMoleculeFormat
39 {
40 public:
41 //Register this format type ID
42 PDBFormat()
43 {
44 OBConversion::RegisterFormat("pdb",this, "chemical/x-pdb");
45 OBConversion::RegisterFormat("ent",this, "chemical/x-pdb");
46 }
47
48 virtual const char* Description() //required
49 {
50 return
51 "Protein Data Bank format\n \
52 Read Options e.g. -as\n\
53 s Output single bonds only\n\
54 b Disable bonding entirely\n\n";
55 };
56
57 virtual const char* SpecificationURL()
58 { return "http://www.rcsb.org/pdb/docs/format/pdbguide2.2/guide2.2_frame.html";};
59
60 virtual const char* GetMIMEType()
61 { return "chemical/x-pdb"; };
62
63 //Flags() can return be any the following combined by | or be omitted if none apply
64 // NOTREADABLE READONEONLY NOTWRITABLE WRITEONEONLY
65 virtual unsigned int Flags()
66 {
67 return READONEONLY;
68 };
69
70 //*** This section identical for most OBMol conversions ***
71 ////////////////////////////////////////////////////
72 /// The "API" interface functions
73 virtual bool ReadMolecule(OBBase* pOb, OBConversion* pConv);
74 virtual bool WriteMolecule(OBBase* pOb, OBConversion* pConv);
75
76 };
77 //***
78
79 //Make an instance of the format class
80 PDBFormat thePDBFormat;
81
82 /////////////////////////////////////////////////////////////////
83
84
85 static bool ParseAtomRecord(char *, OBMol &,int);
86 static bool ParseConectRecord(char *,OBMol &);
87
88 //extern OBResidueData resdat; now in mol.h
89
90 /////////////////////////////////////////////////////////////////
91 bool PDBFormat::ReadMolecule(OBBase* pOb, OBConversion* pConv)
92 {
93
94 OBMol* pmol = dynamic_cast<OBMol*>(pOb);
95 if(pmol==NULL)
96 return false;
97
98 //Define some references so we can use the old parameter names
99 istream &ifs = *pConv->GetInStream();
100 OBMol &mol = *pmol;
101 const char* title = pConv->GetTitle();
102
103 int chainNum = 1;
104 char buffer[BUFF_SIZE];
105 OBBitVec bs;
106
107 mol.SetTitle(title);
108
109 mol.BeginModify();
110 while (ifs.getline(buffer,BUFF_SIZE) && !EQn(buffer,"END",3))
111 {
112 if (EQn(buffer,"TER",3))
113 chainNum++;
114 if (EQn(buffer,"ATOM",4) || EQn(buffer,"HETATM",6))
115 {
116 ParseAtomRecord(buffer,mol,chainNum);
117 if (EQn(buffer,"ATOM",4))
118 bs.SetBitOn(mol.NumAtoms());
119 }
120
121 if (EQn(buffer,"CONECT",6))
122 ParseConectRecord(buffer,mol);
123 }
124
125 resdat.AssignBonds(mol,bs);
126 /*assign hetatm bonds based on distance*/
127
128 if (!pConv->IsOption("b",OBConversion::INOPTIONS))
129 mol.ConnectTheDots();
130
131 if (mol.NumAtoms() < 250) // Minimize time required on real proteins
132 if (!pConv->IsOption("s",OBConversion::INOPTIONS) && !pConv->IsOption("b",OBConversion::INOPTIONS))
133 mol.PerceiveBondOrders();
134
135 // clean out remaining blank lines
136 while(ifs.peek() != EOF && ifs.good() &&
137 (ifs.peek() == '\n' || ifs.peek() == '\r'))
138 ifs.getline(buffer,BUFF_SIZE);
139
140 mol.EndModify();
141
142 mol.SetAtomTypesPerceived();
143 atomtyper.AssignImplicitValence(mol);
144
145 if (!mol.NumAtoms())
146 return(false);
147 return(true);
148 }
149
150 ////////////////////////////////////////////////////////////////
151 static bool ParseAtomRecord(char *buffer, OBMol &mol,int chainNum)
152 /* ATOMFORMAT "(i5,1x,a4,a1,a3,1x,a1,i4,a1,3x,3f8.3,2f6.2,1x,i3)" */
153 {
154 string sbuf = &buffer[6];
155 if (sbuf.size() < 48)
156 return(false);
157
158 bool hetatm = (EQn(buffer,"HETATM",6)) ? true : false;
159
160 /* serial number */
161 string serno = sbuf.substr(0,5);
162 //SerialNum(the_atom) = atoi(tmp_str);
163
164 /* atom name */
165 string atmid = sbuf.substr(6,4);
166
167 /* element */
168 string element;
169 if (sbuf.size() > 71)
170 element = sbuf.substr(70,2);
171 else
172 element = " ";
173
174 //trim spaces on the right and left sides
175 while (!atmid.empty() && atmid[0] == ' ')
176 atmid = atmid.substr(1,atmid.size()-1);
177
178 while (!atmid.empty() && atmid[atmid.size()-1] == ' ')
179 atmid = atmid.substr(0,atmid.size()-1);
180
181 /* residue name */
182
183 string resname = sbuf.substr(11,3);
184 if (resname == " ")
185 resname = "UNK";
186 else
187 {
188 while (!resname.empty() && resname[0] == ' ')
189 resname = resname.substr(1,resname.size()-1);
190
191 while (!resname.empty() && resname[resname.size()-1] == ' ')
192 resname = resname.substr(0,resname.size()-1);
193 }
194
195 /* residue sequence number */
196
197 string resnum = sbuf.substr(16,4);
198
199 /* X, Y, Z */
200 string xstr = sbuf.substr(24,8);
201 string ystr = sbuf.substr(32,8);
202 string zstr = sbuf.substr(40,8);
203
204 string type;
205
206 if (EQn(buffer,"ATOM",4))
207 {
208 type = atmid.substr(0,2);
209 if (isdigit(type[0]))
210 type = atmid.substr(1,1);
211 else if (sbuf[6] == ' ' &&
212 strncasecmp(type.c_str(), "Zn", 2) != 0 &&
213 strncasecmp(type.c_str(), "Fe", 2) != 0)
214 type = atmid.substr(0,1); // one-character element
215
216
217 if (resname.substr(0,2) == "AS" || resname[0] == 'N')
218 {
219 if (atmid == "AD1")
220 type = "O";
221 if (atmid == "AD2")
222 type = "N";
223 }
224 if (resname.substr(0,3) == "HIS" || resname[0] == 'H')
225 {
226 if (atmid == "AD1" || atmid == "AE2")
227 type = "N";
228 if (atmid == "AE1" || atmid == "AD2")
229 type = "C";
230 }
231 if (resname.substr(0,2) == "GL" || resname[0] == 'Q')
232 {
233 if (atmid == "AE1")
234 type = "O";
235 if (atmid == "AE2")
236 type = "N";
237 }
238 }
239 else //must be hetatm record
240 {
241 if (isalpha(element[1]) && (isalpha(element[0]) || (element[0] == ' ')))
242 {
243 if (isalpha(element[0]))
244 type = element.substr(0,2);
245 else
246 type = element.substr(1,1);
247 if (type.size() == 2)
248 type[1] = tolower(type[1]);
249 }
250 else
251 {
252 if (isalpha(atmid[0]))
253 type = atmid.substr(0,2);
254 else if (atmid[0] == ' ')
255 type = atmid.substr(1,1); // one char element
256 else
257 type = atmid.substr(1,2);
258
259 if (atmid == resname)
260 {
261 type = atmid;
262 if (type.size() == 2)
263 type[1] = tolower(type[1]);
264 }
265 else
266 if (resname == "ADR" || resname == "COA" || resname == "FAD" ||
267 resname == "GPG" || resname == "NAD" || resname == "NAL" ||
268 resname == "NDP")
269 {
270 if (type.size() > 1)
271 type = type.substr(0,1);
272 //type.erase(1,type.size()-1);
273 }
274 else
275 if (isdigit(type[0]))
276 {
277 type = type.substr(1,1);
278 //type.erase(0,1);
279 //if (type.size() > 1) type.erase(1,type.size()-1);
280 }
281 else
282 if (type.size() > 1 && isdigit(type[1]))
283 type = type.substr(0,1);
284 //type.erase(1,1);
285 else
286 if (type.size() > 1 && isalpha(type[1]) && isupper(type[1]))
287 type[1] = tolower(type[1]);
288 }
289
290 }
291
292 OBAtom atom;
293 vector3 v(atof(xstr.c_str()),atof(ystr.c_str()),atof(zstr.c_str()));
294 atom.SetVector(v);
295
296 atom.SetAtomicNum(etab.GetAtomicNum(type.c_str()));
297 atom.SetType(type);
298
299 int rnum = atoi(resnum.c_str());
300 OBResidue *res = (mol.NumResidues() > 0) ? mol.GetResidue(mol.NumResidues()-1) : NULL;
301 if (res == NULL || res->GetName() != resname || static_cast<int>(res->GetNum())
302 != rnum)
303 {
304 vector<OBResidue*>::iterator ri;
305 for (res = mol.BeginResidue(ri) ; res ; res = mol.NextResidue(ri))
306 if (res->GetName() == resname && static_cast<int>(res->GetNum())
307 == rnum)
308 break;
309
310 if (res == NULL)
311 {
312 res = mol.NewResidue()
313 ;
314 res->SetChainNum(chainNum);
315 res->SetName(resname);
316 res->SetNum(rnum);
317 }
318 }
319
320 if (!mol.AddAtom(atom)
321 )
322 return(false);
323 else
324 {
325 OBAtom *atom = mol.GetAtom(mol.NumAtoms());
326
327 res->AddAtom(atom);
328 res->SetSerialNum(atom, atoi(serno.c_str()));
329 res->SetAtomID(atom, atmid);
330 res->SetHetAtom(atom, hetatm);
331
332 return(true);
333 }
334 }
335
336 /////////////////////////////////////////////////////////////////////////
337 //! Utility function to read a 5-digit integer starting from a specified column
338 /*! This function reads a 5-digit integer, starting from column
339 columnAsSpecifiedInPDB from the buffer, converts it to a long
340 integer, and returns either false or true, if the conversion was
341 successful or not. If the conversion was not successful, the target
342 is set to a random value.
343
344 For instance, the PDB Format Description for a CONECT record specifies
345
346 COLUMNS DATA TYPE FIELD DEFINITION
347 ---------------------------------------------------------------------------------
348 1 - 6 Record name "CONECT"
349 7 - 11 Integer serial Atom serial number
350 ...
351
352 To read the Atom serial number, you would call
353
354 long int target;
355 if ( readIntegerFromRecord(buffer, 7, &target) == false ) {
356 cerr << "Could not parse" << endl;
357 }
358
359 This function does not check the length of the buffer, or
360 strlen(buffer). If the buffer is not long enough => SEGFAULT.
361 */
362 static bool readIntegerFromRecord(char *buffer, unsigned int columnAsSpecifiedInPDB, long int *target)
363 {
364 char integerBuffer[6];
365 integerBuffer[5] = 0;
366
367 strncpy(integerBuffer, buffer+columnAsSpecifiedInPDB-1, 5);
368
369 char *errorCheckingEndPtr;
370 *target = strtol(integerBuffer, &errorCheckingEndPtr, 10);
371 if (integerBuffer == errorCheckingEndPtr)
372 return(false);
373 return(true);
374 }
375
376 //! Read a CONECT record
377 /*! This function reads a CONECT record, as specified
378 http://www.rcsb.org/pdb/docs/format/pdbguide2.2/guide2.2_frame.html,
379 in short:
380
381 COLUMNS DATA TYPE FIELD DEFINITION
382 ---------------------------------------------------------------------------------
383 1 - 6 Record name "CONECT"
384 7 - 11 Integer serial Atom serial number
385 12 - 16 Integer serial Serial number of bonded atom
386 17 - 21 Integer serial Serial number of bonded atom
387 22 - 26 Integer serial Serial number of bonded atom
388 27 - 31 Integer serial Serial number of bonded atom
389 32 - 36 Integer serial Serial number of hydrogen bonded atom
390 37 - 41 Integer serial Serial number of hydrogen bonded atom
391 42 - 46 Integer serial Serial number of salt bridged atom
392 47 - 51 Integer serial Serial number of hydrogen bonded atom
393 52 - 56 Integer serial Serial number of hydrogen bonded atom
394 57 - 61 Integer serial Serial number of salt bridged atom
395
396 Hydrogen bonds and salt bridges are ignored. --Stefan Kebekus.
397 */
398
399 static bool ParseConectRecord(char *buffer,OBMol &mol)
400 {
401 #ifdef HAVE_SSTREAM
402 stringstream errorMsg;
403 #else
404 strstream errorMsg;
405 #endif
406
407 // Setup strings and string buffers
408 buffer[70] = '\0';
409 if (strlen(buffer) < 70)
410 {
411 errorMsg << "WARNING: Problems reading a PDB file\n"
412 << " Problems reading a CONECT record.\n"
413 << " According to the PDB specification,\n"
414 << " the record should have 70 columns, but OpenBabel found "
415 << strlen(buffer) << " columns.";
416 obErrorLog.ThrowError(__FUNCTION__, errorMsg.str() , obInfo);
417 }
418
419 // Serial number of the first atom, read from column 7-11 of the
420 // connect record, to which the other atoms connect to.
421 long int startAtomSerialNumber;
422 if (readIntegerFromRecord(buffer, 7, &startAtomSerialNumber) == false)
423 {
424 errorMsg << "WARNING: Problems reading a PDB file\n"
425 << " Problems reading a CONECT record.\n"
426 << " According to the PDB specification,\n"
427 << " columns 7-11 should contain the serial number of an atom.\n"
428 << " THIS CONECT RECORD WILL BE IGNORED.";
429 obErrorLog.ThrowError(__FUNCTION__, errorMsg.str() , obWarning);
430 return(false);
431 }
432
433 // Find a pointer to the first atom.
434 OBAtom *firstAtom = 0L;
435 vector<OBNodeBase*>::iterator i;
436 for (OBAtom *a1 = mol.BeginAtom(i);a1;a1 = mol.NextAtom(i))
437 if (static_cast<long int>(a1->GetResidue()->
438 GetSerialNum(a1)) == startAtomSerialNumber)
439 {
440 firstAtom = a1;
441 break;
442 }
443 if (firstAtom == 0L)
444 {
445 errorMsg << "WARNING: Problems reading a PDB file:\n"
446 << " Problems reading a CONECT record.\n"
447 << " According to the PDB specification,\n"
448 << " columns 7-11 should contain the serial number of an atom.\n"
449 << " No atom was found with this serial number.\n"
450 << " THIS CONECT RECORD WILL BE IGNORED.";
451 obErrorLog.ThrowError(__FUNCTION__, errorMsg.str() , obWarning);
452 return(false);
453 }
454
455 // Serial numbers of the atoms which bind to firstAtom, read from
456 // columns 12-16, 17-21, 22-27 and 27-31 of the connect record. Note
457 // that we reserve space for 5 integers, but read only four of
458 // them. This is to simplify the determination of the bond order;
459 // see below.
460 long int boundedAtomsSerialNumbers[5] = {0,0,0,0,0};
461 // Bools which tell us which of the serial numbers in
462 // boundedAtomsSerialNumbers are read from the file, and which are
463 // invalid
464 bool boundedAtomsSerialNumbersValid[5] = {false, false, false, false, false};
465
466 // Now read the serial numbers. If the first serial number is not
467 // present, this connect record probably contains only hydrogen
468 // bonds and salt bridges, which we ignore. In that case, we just
469 // exit gracefully.
470 boundedAtomsSerialNumbersValid[0] = readIntegerFromRecord(buffer, 12, boundedAtomsSerialNumbers+0);
471 if (boundedAtomsSerialNumbersValid[0] == false)
472 return(true);
473 boundedAtomsSerialNumbersValid[1] = readIntegerFromRecord(buffer, 17, boundedAtomsSerialNumbers+1);
474 boundedAtomsSerialNumbersValid[2] = readIntegerFromRecord(buffer, 22, boundedAtomsSerialNumbers+2);
475 boundedAtomsSerialNumbersValid[3] = readIntegerFromRecord(buffer, 27, boundedAtomsSerialNumbers+3);
476
477 // Now iterate over the VALID boundedAtomsSerialNumbers and connect
478 // the atoms.
479 for(unsigned int k=0; boundedAtomsSerialNumbersValid[k]; k++)
480 {
481 // Find atom that is connected to, write an error message
482 OBAtom *connectedAtom = 0L;
483 for (OBAtom *a1 = mol.BeginAtom(i);a1;a1 = mol.NextAtom(i))
484 if (static_cast<long int>(a1->GetResidue()->
485 GetSerialNum(a1)) == boundedAtomsSerialNumbers[k])
486 {
487 connectedAtom = a1;
488 break;
489 }
490 if (connectedAtom == 0L)
491 {
492 errorMsg << "WARNING: Problems reading a PDB file:\n"
493 << " Problems reading a CONECT record.\n"
494 << " According to the PDB specification,\n"
495 << " Atoms with serial #" << startAtomSerialNumber
496 << " and #" << boundedAtomsSerialNumbers[k]
497 << " should be connected\n"
498 << " However, an atom with serial #" << boundedAtomsSerialNumbers[k] << " was not found.\n"
499 << " THIS CONECT RECORD WILL BE IGNORED.";
500 obErrorLog.ThrowError(__FUNCTION__, errorMsg.str() , obWarning);
501 break;
502 }
503
504 // Figure the bond order
505 unsigned char order = 0;
506 while(boundedAtomsSerialNumbersValid[k+order+1] && (boundedAtomsSerialNumbers[k+order]
507 == boundedAtomsSerialNumbers[k+order+1]))
508 order++;
509 k += order;
510
511 // Generate the bond
512 mol.AddBond(firstAtom->GetIdx(), connectedAtom->GetIdx(), order+1);
513 }
514 return(true);
515 }
516
517 //////////////////////////////////////////////////////////////////////////////
518 bool PDBFormat::WriteMolecule(OBBase* pOb, OBConversion* pConv)
519 {
520 OBMol* pmol = dynamic_cast<OBMol*>(pOb);
521 if(pmol==NULL)
522 return false;
523
524 //Define some references so we can use the old parameter names
525 ostream &ofs = *pConv->GetOutStream();
526 OBMol &mol = *pmol;
527
528 unsigned int i;
529 char buffer[BUFF_SIZE];
530 char type_name[10], padded_name[10];
531 char the_res[10];
532 char *element_name;
533 int res_num;
534 bool het=true;
535
536 // sprintf(buffer,"HEADER PROTEIN");
537 // ofs << buffer << endl;
538
539 if (strlen(mol.GetTitle()) > 0)
540 sprintf(buffer,"COMPND %s ",mol.GetTitle());
541 else
542 sprintf(buffer,"COMPND UNNAMED");
543 ofs << buffer << endl;
544
545 sprintf(buffer,"AUTHOR GENERATED BY OPEN BABEL %s",BABEL_VERSION);
546 ofs << buffer << endl;
547
548 OBAtom *atom;
549 OBResidue *res;
550 for (i = 1; i <= mol.NumAtoms(); i++)
551 {
552 atom = mol.GetAtom(i);
553 strcpy(type_name,etab.GetSymbol(atom->GetAtomicNum()));
554
555 //two char. elements are on position 13 and 14 one char. start at 14
556 if (strlen(type_name) > 1)
557 type_name[1] = toupper(type_name[1]);
558 else
559 {
560 char tmp[10];
561 strcpy(tmp, type_name);
562 sprintf(type_name, " %-3s", tmp);
563 }
564
565 if ( (res = atom->GetResidue()) )
566 {
567 het = res->IsHetAtom(atom);
568 snprintf(the_res,4,"%s",(char*)res->GetName().c_str());
569 snprintf(type_name,5,"%s",(char*)res->GetAtomID(atom).c_str());
570
571 //two char. elements are on position 13 and 14 one char. start at 14
572 if (strlen(etab.GetSymbol(atom->GetAtomicNum())) == 1)
573 {
574 if (strlen(type_name) < 4)
575 {
576 char tmp[10];
577 strcpy(tmp, type_name);
578 sprintf(padded_name," %-3s", tmp);
579 strncpy(type_name,padded_name,4);
580 type_name[4] = '\0';
581 }
582 else
583 {
584 type_name[4] = type_name[3];
585 type_name[3] = type_name[2];
586 type_name[2] = type_name[1];
587 type_name[1] = type_name[0];
588 type_name[0] = type_name[4];
589 type_name[4] = '\0';
590 }
591 }
592 res_num = res->GetNum();
593 }
594 else
595 {
596 strcpy(the_res,"UNK");
597 sprintf(padded_name,"%s",type_name);
598 strncpy(type_name,padded_name,4);
599 type_name[4] = '\0';
600 res_num = 1;
601 }
602
603 element_name = etab.GetSymbol(atom->GetAtomicNum());
604 if (strlen(element_name) == 2)
605 element_name[1] = toupper(element_name[1]);
606 sprintf(buffer,"%s%5d %-4s %-3s %4d %8.3f%8.3f%8.3f 1.00 0.00 %2s \n",
607 het?"HETATM":"ATOM ",
608 i,
609 type_name,
610 the_res,
611 res_num,
612 atom->GetX(),
613 atom->GetY(),
614 atom->GetZ(),
615 element_name);
616 ofs << buffer;
617 }
618
619 OBAtom *nbr;
620 int count;
621 vector<OBEdgeBase*>::iterator k;
622 for (i = 1; i <= mol.NumAtoms(); i ++)
623 {
624 atom = mol.GetAtom(i);
625 if (atom->GetValence() <= 4)
626 {
627 sprintf(buffer,"CONECT%5d", i);
628 ofs << buffer;
629 for (nbr = atom->BeginNbrAtom(k);nbr;nbr = atom->NextNbrAtom(k))
630 {
631 sprintf(buffer,"%5d", nbr->GetIdx());
632 ofs << buffer;
633 }
634 for (count = 0; count < (4 - (int)atom->GetValence()); count++)
635 {
636 sprintf(buffer, " ");
637 ofs << buffer;
638 }
639 ofs << " " << endl;
640 }
641 }
642 sprintf(buffer,"MASTER 0 0 0 0 0 0 0 0 ");
643 ofs << buffer;
644 sprintf(buffer,"%4d 0 %4d 0",mol.NumAtoms(),mol.NumAtoms());
645 ofs << buffer << endl;
646 sprintf(buffer,"END");
647 ofs << buffer << endl;
648 return(true);
649 }
650
651
652 } //namespace OpenBabel