layer2/CifFile.cpp (263 lines of code) (raw):
/*
* CIF tokenizer
*
* All keys are canonicalized to lowercase
*
* (c) 2014 Schrodinger, Inc.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vector>
#include <iostream>
#include "CifFile.h"
#include "File.h"
#include "MemoryDebug.h"
#include "strcasecmp.h"
// basic IO and string handling
/*
 * Parse a floating point number, skipping a parenthesized uncertainty.
 *
 * The first "(...)" group is cut out of the text before conversion, e.g.
 * 1.23(45)e2 is read as 1.23e2. Without a complete group the string is
 * converted unchanged.
 */
double scifloat(const char *str) {
  const char *lparen = strchr(str, '(');
  const char *rparen = lparen ? strchr(lparen, ')') : NULL;

  // no complete "(...)" group: plain conversion
  if (!rparen)
    return atof(str);

  // everything before '(' followed by everything after ')'
  std::string stripped(str, lparen - str);
  stripped += rparen + 1;
  return atof(stripped.c_str());
}
// True for space, tab, carriage return, line feed and for the
// terminating null character
static bool iswhitespace0(char c) {
  switch (c) {
    case '\0':
    case ' ':
    case '\t':
    case '\r':
    case '\n':
      return true;
    default:
      return false;
  }
}
// True for space, tab, carriage return and line feed; unlike
// iswhitespace0() the null character does not count
static bool iswhitespace(char c) {
  if (c == '\0')
    return false;
  return iswhitespace0(c);
}
// True if "c" terminates a line (carriage return or line feed)
static bool islinefeed(char c) {
  switch (c) {
    case '\r':
    case '\n':
      return true;
    default:
      return false;
  }
}
// True if "c" terminates a line or the whole string (carriage return,
// line feed or null)
static bool islinefeed0(char c) {
  if (c == '\0')
    return true;
  return islinefeed(c);
}
// True if "c" is a single or a double quote character
static bool isquote(char c) {
  switch (c) {
    case '\'':
    case '"':
      return true;
    default:
      return false;
  }
}
// FreeBSD name conflict
#ifdef isspecial
#undef isspecial
#endif
// True if token is a STAR keyword: a data name (leading underscore), a
// "data_"/"save_" header (case-insensitive prefix) or exactly one of
// "loop_", "stop_", "global_" (case-insensitive)
static bool isspecial(const char *token) {
  // data names
  if (token[0] == '_')
    return true;

  // block and frame headers carry a name after the prefix
  if (strncasecmp("data_", token, 5) == 0)
    return true;
  if (strncasecmp("save_", token, 5) == 0)
    return true;

  // bare keywords must match as a whole
  const char * const keywords[] = {"loop_", "stop_", "global_"};
  for (unsigned int k = 0; k < sizeof(keywords) / sizeof(keywords[0]); ++k) {
    if (strcasecmp(keywords[k], token) == 0)
      return true;
  }

  return false;
}
// Lowercase the ASCII letters 'A'..'Z' of a null-terminated string in
// place; every other character is left untouched
static void tolowerinplace(char *p) {
  while (*p) {
    const bool is_upper = (*p >= 'A' && *p <= 'Z');
    if (is_upper)
      *p += 'a' - 'A';
    ++p;
  }
}
// CIF stuff
// Shared empty string, returned by cif_array::as_s() when a value is missing
static const char * EMPTY_STRING = "";
// Shared default array (constructed from NULL), handed out by
// cif_data::get_opt() when none of the requested keys is found
static cif_array EMPTY_ARRAY(NULL);
/*
 * Class to store CIF loops. Only for parsing, do not use in any higher level
 * reading functions.
 *
 * "values" points to the first of (nrows * ncols) consecutive cell pointers
 * in row-major order (cell = values[row * ncols + col]). The class does not
 * own that storage.
 */
class cif_loop {
public:
  int ncols;
  int nrows;
  const char **values;

  // start out as an empty table so that a lookup is always well defined
  cif_loop() : ncols(0), nrows(0), values(NULL) {}

  // methods
  const char * get_value_raw(int row, int col) const;
};

// get table value, return NULL if either index is out of bounds (negative
// or past the end) or if no value storage has been attached yet
const char * cif_loop::get_value_raw(int row, int col) const {
  if (!values)
    return NULL;

  if (row < 0 || row >= nrows)
    return NULL;

  if (col < 0 || col >= ncols)
    return NULL;

  return values[row * ncols + col];
}
// Number of elements in this array: 1 for a single value (col < 0),
// otherwise the row count of the loop the array is a column of
int cif_array::get_nrows() const {
  if (col < 0)
    return 1;
  return pointer.loop->nrows;
}
// Raw value at "row". A single value (col < 0) is returned for every
// row <= 0 and NULL for row > 0; a loop column delegates to the loop,
// which returns NULL for a row index past the end. The stored value
// itself may also be NULL (values '.' and '?').
const char * cif_array::get_value(int row) const {
  if (col >= 0)
    return pointer.loop->get_value_raw(row, col);

  if (row > 0)
    return NULL;

  return pointer.value;
}
// Value at "row" as C string; a missing value yields the empty string
// instead of NULL
const char * cif_array::as_s(int row) const {
  const char * raw = get_value(row);
  if (!raw)
    return EMPTY_STRING;
  return raw;
}
// Value at "row" converted with atoi(); a missing value yields the
// fallback "d"
int cif_array::as_i(int row, int d) const {
  const char * raw = get_value(row);
  if (!raw)
    return d;
  return atoi(raw);
}
// Value at "row" converted with scifloat() (uncertainty notation is
// ignored); a missing value yields the fallback "d"
double cif_array::as_d(int row, double d) const {
  const char * raw = get_value(row);
  if (!raw)
    return d;
  return scifloat(raw);
}
// True if is_missing() holds for every row of the array
bool cif_array::is_missing_all() const {
  const int total = get_nrows();
  int row = 0;

  // advance past the leading run of missing rows
  while (row < total && is_missing(row))
    ++row;

  // only missing rows were seen if the scan reached the end
  return row == total;
}
// Templated getters: one explicit specialization per supported result
// type, each forwarding to the matching non-template accessor

// raw pointer, NULL if missing
template <> const char* cif_array::as<const char* >(int row) const {
  return get_value(row);
}

// string copy, empty if missing
template <> std::string cif_array::as<std::string >(int row) const {
  return std::string(as_s(row));
}

// integer, as_i() default if missing
template <> int cif_array::as<int >(int row) const {
  return as_i(row);
}

// double, as_d() default if missing
template <> double cif_array::as<double >(int row) const {
  return as_d(row);
}

// float, converted from the double returned by as_d()
template <> float cif_array::as<float >(int row) const {
  return static_cast<float>(as_d(row));
}
/*
* Get a pointer to array or NULL if not found
*
* Can look up up to 3 different names (the key and two aliases); the first
* one found is returned. The search stops at the first NULL alias.
* Also supports an alias shortcut for the trivial case where mmCIF uses
* a period and CIF uses an underscore: (key="_foo?bar") is identical to
* (key="_foo.bar", alias1="_foo_bar")
*/
const cif_array * cif_data::get_arr(const char * key, const char * alias1, const char * alias2) const {
  // names are tried in this order; the first NULL entry ends the search
  const char * const candidates[] = {key, alias1, alias2, NULL};

  for (int n = 0; candidates[n]; ++n) {
    const char * name = candidates[n];
    const char * wildcard = strchr(name, '?');

    if (!wildcard) {
      // plain name: single lookup
      m_str_cifarray_t::const_iterator found = dict.find(name);
      if (found != dict.end())
        return &found->second;
      continue;
    }

    // alias shortcut: try the name with '?' replaced by '.', then by '_'
    std::string expanded(name);
    const char separators[] = "._";
    for (const char * sep = separators; *sep; ++sep) {
      expanded[wildcard - name] = *sep;
      m_str_cifarray_t::const_iterator found = dict.find(expanded.c_str());
      if (found != dict.end())
        return &found->second;
    }
  }

  return NULL;
}
// Same lookup as get_arr(), but never returns NULL: if none of the names
// is found, a pointer to the shared EMPTY_ARRAY is returned instead
const cif_array * cif_data::get_opt(const char * key, const char * alias1, const char * alias2) const {
  const cif_array * found = get_arr(key, alias1, alias2);
  return found ? found : &EMPTY_ARRAY;
}
// Constructor: takes the CIF text from a private copy of "contents_" if
// given, otherwise from the file "filename", and parses it. A file that
// cannot be loaded is reported on stderr and leaves the object unparsed.
cif_file::cif_file(const char* filename, const char* contents_) {
  if (!contents_) {
    // no in-memory text given: read the file
    contents = FileGetContents(filename, NULL);
    if (!contents) {
      std::cerr << "ERROR: Failed to load file '" << filename << "'" << std::endl;
      return;
    }
  } else {
    // parse() modifies the buffer, so work on a copy
    contents = mstrdup(contents_);
  }

  if (contents)
    parse();
}
// Destructor: deletes every data block, then releases the text buffer
cif_file::~cif_file() {
  m_str_cifdatap_t::iterator block = datablocks.begin();
  const m_str_cifdatap_t::iterator last = datablocks.end();
  while (block != last) {
    delete block->second;
    ++block;
  }

  if (contents) {
    mfree(contents);
  }
}
// Destructor: deletes the nested save frames, then the loops that were
// registered for this block
cif_data::~cif_data() {
  m_str_cifdatap_t::iterator frame = saveframes.begin();
  const m_str_cifdatap_t::iterator frames_end = saveframes.end();
  while (frame != frames_end) {
    delete frame->second;
    ++frame;
  }

  v_cifloopp_t::iterator loop = loops.begin();
  const v_cifloopp_t::iterator loops_end = loops.end();
  while (loop != loops_end) {
    delete *loop;
    ++loop;
  }
}
// parse CIF contents
bool cif_file::parse() {
char *p = contents;
char quote;
char prev = '\0';
std::vector<bool> keypossible;
// tokenize
while (true) {
while (iswhitespace(*p))
prev = *(p++);
if (!*p)
break;
if (*p == '#') {
while (!(islinefeed0(*++p)));
prev = *p;
} else if (isquote(*p)) { // will NULL the closing quote
quote = *p;
keypossible.push_back(false);
tokens.push_back(p + 1);
while (*++p && !(*p == quote && iswhitespace0(p[1])));
if (*p)
*(p++) = 0;
prev = *p;
} else if (*p == ';' && islinefeed(prev)) { // will NULL the line feed before the closing semicolon
keypossible.push_back(false);
tokens.push_back(p + 1);
while (*++p && !(islinefeed(*p) && p[1] == ';'));
if (*p) {
*p = 0;
p += 2;
}
prev = ';';
} else { // will null the whitespace
char * q = p++;
while (!iswhitespace0(*p)) ++p;
prev = *p;
if (p - q == 1 && (*q == '?' || *q == '.')) {
// store values '.' (inapplicable) and '?' (unknown) as null-pointers
q = NULL;
keypossible.push_back(false);
} else {
if (*p)
*(p++) = 0;
keypossible.push_back(true);
}
tokens.push_back(q);
}
}
cif_data *current_data = NULL, *current_frame = NULL, *global_block = NULL;
// parse into dictionary
for (unsigned int i = 0, n = tokens.size(); i < n; i++) {
if (!keypossible[i]) {
std::cout << "ERROR" << std::endl;
break;
} else if (tokens[i][0] == '_') {
if (i + 1 == n) {
std::cout << "ERROR truncated" << std::endl;
break;
}
if (current_frame) {
tolowerinplace(tokens[i]);
current_frame->dict[tokens[i]].set_value(tokens[i + 1]);
}
i++;
} else if (strcasecmp("loop_", tokens[i]) == 0) {
int ncols = 0;
int nrows = 0;
cif_loop *loop = NULL;
// loop data
if (current_frame) {
loop = new cif_loop;
// add to loops list
current_frame->loops.push_back(loop);
}
// columns
while (++i < n && keypossible[i] && tokens[i][0] == '_') {
tolowerinplace(tokens[i]);
if (current_frame) {
current_frame->dict[tokens[i]].set_loop(loop, ncols);
}
ncols++;
}
if (loop) {
// loop data
loop->values = (const char **) &tokens[i];
loop->ncols = ncols;
}
// rows
while (i < n && !(keypossible[i] && isspecial(tokens[i]))) {
i += ncols;
if (i > n) {
std::cout << "ERROR truncated loop" << std::endl;
break;
}
nrows++;
}
// loop data
if (loop) {
loop->nrows = nrows;
}
i--;
} else if (strncasecmp("data_", tokens[i], 5) == 0) {
const char * key(tokens[i] + 5);
datablocks[key] = current_data = current_frame = new cif_data;
} else if (strncasecmp("global_", tokens[i], 5) == 0) {
// STAR feature, not supported in CIF
global_block = current_data = current_frame = new cif_data;
} else if (strncasecmp("save_", tokens[i], 5) == 0) {
if (tokens[i][5] && current_data) {
// begin
const char * key(tokens[i] + 5);
current_data->saveframes[key] = current_frame = new cif_data;
} else {
// end
current_frame = current_data;
}
} else {
std::cout << "ERROR" << std::endl;
break;
}
}
if (global_block)
delete global_block;
return true;
}
// vi:sw=2:ts=2