lib/lisp/lexer.cpp

   1 //  $Id$
   2 //
   3 //  Copyright (C) 2004 Matthias Braun <matze@braunis.de>
   4 //  code in this file based on lispreader from Mark Probst
   5 //
   6 //  This program is free software; you can redistribute it and/or
   7 //  modify it under the terms of the GNU General Public License
   8 //  as published by the Free Software Foundation; either version 2
   9 //  of the License, or (at your option) any later version.
  10 //
  11 //  This program is distributed in the hope that it will be useful,
  12 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 //  GNU General Public License for more details.
  15 //
  16 //  You should have received a copy of the GNU General Public License
  17 //  along with this program; if not, write to the Free Software
  18 //  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#include <config.h>

#include <cctype>
#include <cstring>
#include <sstream>
#include <stdexcept>

#include "lexer.h"
  25
  26 namespace lisp
  27 {
  28
  29 class EOFException
  30 {
  31 };
  32
  33 Lexer::Lexer(std::istream& newstream)
  34     : stream(newstream), eof(false), linenumber(0)
  35 {
  36   try {
  37     // trigger a refill of the buffer
  38     c = 0;
  39     bufend = c + 1;
  40     nextChar();
  41   } catch(EOFException& e) {
  42   }
  43 }
  44
  45 Lexer::~Lexer()
  46 {
  47 }
  48
  49 void
  50 Lexer::nextChar()
  51 {
  52   ++c;
  53   if(c >= bufend) {
  54     if(eof)
  55       throw EOFException();
  56     std::streamsize n = stream.readsome(buffer, BUFFER_SIZE);
  57
  58     c = buffer;
  59     bufend = buffer + n;
  60
  61     // the following is a hack that appends an additional ' ' at the end of
  62     // the file to avoid problems when parsing symbols/elements and a sudden
  63     // EOF. This is faster than relying on unget and IMO also nicer.
  64     if(n == 0 || stream.eof()) {
  65       eof = true;
  66       *bufend = ' ';
  67       ++bufend;
  68     }
  69   }
  70 }
  71
  72 Lexer::TokenType
  73 Lexer::getNextToken()
  74 {
  75   static const char* delims = "\"();";
  76
  77   try {
  78     while(isspace(*c)) {
  79       if(*c == '\n')
  80         ++linenumber;
  81       nextChar();
  82     };
  83
  84     token_length = 0;
  85
  86     switch(*c) {
  87       case ';': // comment
  88         while(!stream.eof()) {
  89           nextChar();
  90           if(*c == '\n') {
  91             ++linenumber;
  92             break;
  93           }
  94         }
  95         return getNextToken(); // and again
  96       case '(':
  97         nextChar();
  98         return TOKEN_OPEN_PAREN;
  99       case ')':
 100         nextChar();
 101         return TOKEN_CLOSE_PAREN;
 102       case '"': {  // string
 103         int startline = linenumber;
 104         try {
 105           while(1) {
 106             if(stream.eof()) {
 107               std::stringstream msg;
 108               msg << "Parse Error in line " << startline << ": "
 109                 << "Couldn't find end of string.";
 110               throw std::runtime_error(msg.str());
 111             }
 112             nextChar();
 113             if(*c == '"')
 114               break;
 115             else if(*c == '\n')
 116               linenumber++;
 117             else if(*c == '\\') {
 118               nextChar();
 119               switch(*c) {
 120                 case 'n':
 121                   *c = '\n';
 122                   break;
 123                 case 't':
 124                   *c = '\t';
 125                   break;
 126               }
 127             }
 128             if(token_length < MAX_TOKEN_LENGTH)
 129               token_string[token_length++] = *c;
 130           }
 131           token_string[token_length] = 0;
 132         } catch(EOFException& ) {
 133           std::stringstream msg;
 134           msg << "Parse error in line " << startline << ": "
 135             << "EOF while parsing string.";
 136           throw std::runtime_error(msg.str());
 137         }
 138         nextChar();
 139         return TOKEN_STRING;
 140       }
 141       case '#': // constant
 142         try {
 143           nextChar();
 144
 145           while(isalnum(*c) || *c == '_') {
 146             if(token_length < MAX_TOKEN_LENGTH)
 147               token_string[token_length++] = *c;
 148             nextChar();
 149           }
 150           token_string[token_length] = 0;
 151         } catch(EOFException& ) {
 152           std::stringstream msg;
 153           msg << "Parse Error in line " << linenumber << ": "
 154             << "EOF while parsing constant.";
 155           throw std::runtime_error(msg.str());
 156         }
 157
 158         if(strcmp(token_string, "t") == 0)
 159           return TOKEN_TRUE;
 160         if(strcmp(token_string, "f") == 0)
 161           return TOKEN_FALSE;
 162
 163         // we only handle #t and #f constants at the moment...
 164
 165         {
 166           std::stringstream msg;
 167           msg << "Parse Error in line " << linenumber << ": "
 168             << "Unknown constant '" << token_string << "'.";
 169           throw std::runtime_error(msg.str());
 170         }
 171
 172       default:
 173         if(isdigit(*c) || *c == '-') {
 174           bool have_nondigits = false;
 175           bool have_digits = false;
 176           int have_floating_point = 0;
 177
 178           do {
 179             if(isdigit(*c))
 180               have_digits = true;
 181             else if(*c == '.')
 182               ++have_floating_point;
 183             else if(isalnum(*c) || *c == '_')
 184               have_nondigits = true;
 185
 186             if(token_length < MAX_TOKEN_LENGTH)
 187               token_string[token_length++] = *c;
 188
 189             nextChar();
 190           } while(!isspace(*c) && !strchr(delims, *c));
 191
 192           token_string[token_length] = 0;
 193
 194           // no nextChar
 195
 196           if(have_nondigits || !have_digits || have_floating_point > 1)
 197             return TOKEN_SYMBOL;
 198           else if(have_floating_point == 1)
 199             return TOKEN_REAL;
 200           else
 201             return TOKEN_INTEGER;
 202         } else {
 203           do {
 204             if(token_length < MAX_TOKEN_LENGTH)
 205               token_string[token_length++] = *c;
 206             nextChar();
 207           } while(!isspace(*c) && !strchr(delims, *c));
 208           token_string[token_length] = 0;
 209
 210           // no nextChar
 211
 212           return TOKEN_SYMBOL;
 213         }
 214     }
 215   } catch(EOFException& ) {
 216     return TOKEN_EOF;
 217   }
 218 }
 219
 220 } // end of namespace lisp
 221