Logo Search packages:      
Sourcecode: passepartout version File versions  Download package

pdfparser.cc

#include "pdfparser.h"
#include "util/stringutil.h"
#include <stdexcept>

#define assert(expr) if(!bool(expr)) throw std::runtime_error("PDF reading error in " __FILE__ " on line " + tostr(__LINE__));

using namespace std;

// Open the PDF file `filename`, read its trailer dictionary and load the
// cross-reference table the trailer's startxref points at.
//
// After construction, xrefs[n] holds the byte offset of in-use object n, or
// 0 when the object is free or was not listed.
//
// Throws std::runtime_error (directly or through assert) when the file does
// not have the expected structure.
//
// Note: this might be bogus for multi-xref documents: it seems only
// startxref at the end should be looked for, and the trailer should follow
// that xref section. Only the first subsection of the xref table is read.
PDFParser::PDFParser(const std::string& filename)
  : source(filename.c_str())
{
  ParsePDF::findTrailer(source);
  std::string token;
  assert(source >> token >> std::ws && token == "trailer");
  trailer = ParsePDF::readDictionary(source);

  assert(source >> token && token == "startxref");
  int startxref;
  assert(source >> startxref);
  assert(startxref >= 0);
  assert(source >> token && token == "%%EOF");

  // The trailer's Size entry gives the number of slots the table needs.
  assert(trailer);
  PDF::Real::Ptr size = trailer->get_entry("Size").dyn_cast<PDF::Real>();
  assert(size /* the trailer must carry a numeric Size entry */);
  const int numxref = int(size->get_value());
  assert(numxref >= 0);

  // Build the table in a local, zero-filled vector first: nothing leaks if
  // one of the checks below throws, and unlisted objects read as 0.
  std::vector<int> table(numxref, 0);

  source.seekg(startxref);
  int idx, num;
  assert(source >> token && token == "xref" && source >> idx >> num);
  // The subsection must fit inside the table announced by Size.
  assert(idx >= 0 && num >= 0 && num <= numxref - idx);
  for(int i = 0; i < num; ++i, ++idx) {
    int offset, generation;
    char status;
    assert(source >> offset >> generation >> status);
    if(status == 'n') {
      assert(generation == 0);
      table[idx] = offset;
    }
    // For any other status the first field is not a byte offset, so the
    // slot keeps its 0 ("no such object") value.
  }

  xrefs = new int[numxref];
  for(int i = 0; i < numxref; ++i)
    xrefs[i] = table[i];
}

// Release the cross-reference offset table allocated by the constructor.
PDFParser::~PDFParser()
{
  delete [] xrefs;
}

// Resolve an indirect reference.
//
// If `refobj` is not a PDF::Ref it is returned unchanged. Otherwise the
// referenced object is read from the file at the offset stored in xrefs.
// An object followed by "endobj" is returned as parsed. An object followed
// by "stream" must be a dictionary; the result is then a PDF::Stream that
// carries the dictionary's entries plus Length bytes of stream data.
//
// Throws std::runtime_error (directly or through assert) on malformed input.
// NOTE(review): the object number cannot be checked against the table size
// here because the size is not kept as a member in the visible code.
PDF::Object::Ptr PDFParser::getObject(PDF::Object::Ptr refobj) {
  PDF::Ref::Ptr ref = refobj.dyn_cast<PDF::Ref>();
  if(!ref) return refobj;

  const int num = ref->get_num();
  assert(num >= 0);
  assert(xrefs[num] != 0);
  // Drop any eof/fail state left by an earlier read so the seek can succeed.
  source.clear();
  source.seekg(xrefs[num]);

  int objnum, generation;
  std::string token;
  assert(source >> objnum >> generation >> token >> std::ws
         && objnum == num && generation == 0 && token == "obj");
  PDF::Object::Ptr result = ParsePDF::readObject(source);

  // Do not skip whitespace after the keyword: for "stream" the bytes that
  // follow the end-of-line marker are data and may themselves be whitespace.
  assert(source >> token);
  if(token == "endobj")
    return result;

  if(token == "stream") {
    // The keyword is terminated by CR LF or a lone LF; consume exactly that.
    if(source.peek() == '\r') source.get();
    if(source.peek() == '\n') source.get();

    PDF::Dictionary::Ptr dict = result.dyn_cast<PDF::Dictionary>();
    assert(dict /* a stream must be a dictionary */);

    PDF::Stream::Ptr stream = PDF::Stream::create();
    // Copy the dictionary
    for(PDF::Dictionary::const_iterator entry = dict->begin();
        entry != dict->end(); ++entry)
      stream->set_entry(entry->first, entry->second);

    // Find out how many bytes to copy. Length may itself be an indirect
    // reference; resolving it moves the read position, so restore it after.
    PDF::Object::Ptr len_obj = stream->get_entry("Length");
    if(PDF::Ref::Ptr len_ref = len_obj.dyn_cast<PDF::Ref>()) {
      const std::istream::pos_type data_start = source.tellg();
      len_obj = getObject(len_ref);
      source.clear();
      source.seekg(data_start);
    }
    PDF::Real::Ptr len_real = len_obj.dyn_cast<PDF::Real>();
    if(!len_real)
      throw std::runtime_error("Failed to get stream length");
    const int len = int(len_real->get_value());
    assert(len >= 0);

    // Todo: This should be a lot more efficient!
    for(int k = 0; k < len; ++k) {
      char ch;
      assert(source.get(ch) /* the stream data must not be truncated */);
      stream->data() << ch;
    }

    return stream;
  }
  throw std::runtime_error("Unexpected token: " + token);
}


namespace {

  // Read a literal string "( ... )" from `in`, which must be positioned on
  // the opening parenthesis, and return its decoded contents.
  //
  // Balanced unescaped parentheses are part of the string. Backslash
  // escapes are decoded: \n \r \t \b \f \( \) \\, one to three octal
  // digits, and backslash + end-of-line as a line continuation; a backslash
  // before any other character is dropped. An unescaped end-of-line (CR, LF
  // or CR LF) is stored as a single LF.
  //
  // Throws std::runtime_error if the input ends before the closing ')'.
  PDF::String::Ptr readString(istream& in) {
    assert(in.get() == '(');
    int parenlevel = 0;
    string result;
    while(true) {
      const int c = in.get();
      if(c == istream::traits_type::eof())
        throw std::runtime_error("Unterminated string in pdf");

      switch(c) {
      case '(':
        ++parenlevel;
        result += '(';
        break;

      case ')':
        if(--parenlevel < 0)
          return PDF::String::create(result);
        result += ')';
        break;

      case '\r':
        // CR or CR LF inside a string counts as one LF
        if(in.peek() == '\n') in.get();
        result += '\n';
        break;

      case '\\': {
        const int e = in.get();
        if(e == istream::traits_type::eof())
          throw std::runtime_error("Unterminated string in pdf");
        switch(e) {
        case 'n': result += '\n'; break;
        case 'r': result += '\r'; break;
        case 't': result += '\t'; break;
        case 'b': result += '\b'; break;
        case 'f': result += '\f'; break;
        case '\n':
          break;                // line continuation
        case '\r':
          if(in.peek() == '\n') in.get();
          break;                // line continuation
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7': {
          // up to three octal digits; high-order overflow is discarded
          int value = e - '0';
          for(int k = 0; k < 2; ++k) {
            const int d = in.peek();
            if(d < '0' || d > '7') break;
            in.get();
            value = value * 8 + (d - '0');
          }
          result += char(value & 0xFF);
          break;
        }
        default:
          // covers \( \) \\ and any unknown escape: keep the character only
          result += char(e);
        }
        break;
      }

      default:
        result += char(c);
      }
    }
  }

  // Read a hexadecimal string "< ... >" from `in`, which must be positioned
  // on the opening '<', and return the decoded bytes.
  //
  // Each pair of hex digits (either case) yields one byte. White space
  // between digits is skipped. If the number of digits is odd, the last
  // digit is the high nibble of a final byte whose low nibble is 0.
  //
  // Throws std::runtime_error on a non-hex character or if the input ends
  // before the closing '>'.
  PDF::String::Ptr readHexString(istream& in) {
    assert(in.get() == '<');
    std::string result;
    bool high = true;           // is the next digit a high nibble?
    char ch;
    while(in.get(ch)) {
      if(ch == '>')
        return PDF::String::create(result);

      if(ch == ' ' || ch == '\t' || ch == '\n' || ch == '\f' || ch == '\r'
         || ch == '\0')
        continue;

      int digit;
      if(ch >= '0' && ch <= '9')      digit = ch - '0';
      else if(ch >= 'a' && ch <= 'f') digit = ch - 'a' + 10;
      else if(ch >= 'A' && ch <= 'F') digit = ch - 'A' + 10;
      else
        throw std::runtime_error("Illegal character in pdf hex string");

      if(high)
        result += char(digit * 16);
      else {
        const std::string::size_type last = result.length() - 1;
        result[last] = char(static_cast<unsigned char>(result[last]) | digit);
      }
      high = !high;
    }
    throw std::runtime_error("Read failed while reading hex string from PDF");
  }

  // Read a run of regular characters from `in` into `token` (replacing its
  // previous contents). Reading stops in front of the first delimiter or
  // white-space character, which is put back on the stream, or when the
  // stream runs out of input. Returns `in`.
  // Todo: null should also be irregular!
  istream& getstring(istream& in, string& token) {
    const string delimiters("\t\n\f\r ()[]{}<>/%#");
    token.erase();
    for(char c; in.get(c); token += c) {
      if(delimiters.find(c) != string::npos) {
        in.putback(c);
        break;
      }
    }
    return in;
  }
  
  // Read a name object "/Name" from `in`, which must be positioned on the
  // leading '/', and return it without the slash.
  //
  // The name ends in front of the first delimiter or white-space character
  // (including NUL), which is put back on the stream. A "#xx" sequence is
  // decoded to the byte with hexadecimal value xx.
  //
  // Throws std::runtime_error on a malformed "#xx" escape or if the input
  // ends before a terminating character is seen.
  PDF::Name::Ptr readName(istream& in) {
    assert(in.get() == '/');
    const string irregular = "\t\n\f\r ()[]{}<>/%#";
    string name;
    char ch;
    while(in.get(ch)) {
      if(ch == '#') {
        // two hex digits, most significant first
        int value = 0;
        for(int k = 0; k < 2; ++k) {
          const int h = in.get();
          int digit;
          if(h >= '0' && h <= '9')      digit = h - '0';
          else if(h >= 'a' && h <= 'f') digit = h - 'a' + 10;
          else if(h >= 'A' && h <= 'F') digit = h - 'A' + 10;
          else                          // also reached at end of input
            throw std::runtime_error("Illegal #-escape in pdf name");
          value = value * 16 + digit;
        }
        name += char(value);

      } else if(ch != '\0' && irregular.find(ch) == std::string::npos) {
        name += ch;

      } else {
        in.putback(ch);
        return PDF::Name::create(name);
      }
    }
    throw std::runtime_error("Failed to read in pdf name");
  }
  
  // Read an array "[ ... ]" from `in`, which must be positioned on the
  // opening '[', and return it.
  //
  // Elements are read with ParsePDF::readObject. An 'R' turns the two
  // preceding numbers (object number, generation) into one PDF::Ref.
  //
  // Throws std::runtime_error (directly or through assert) if an 'R' is not
  // preceded by two numbers or the input ends before the closing ']'.
  PDF::Array::Ptr readArray(istream& in) {
    assert(in.get() == '[');
    std::vector<PDF::Object::Ptr> result;
    while(true) {
      in >> ws;
      const int next = in.peek();
      if(next == istream::traits_type::eof())
        throw std::runtime_error("Unexpected end of file in pdf array");

      switch(next) {
      case ']': {
        in.get();
        PDF::Array::Ptr array = PDF::Array::create();
        for(std::vector<PDF::Object::Ptr>::const_iterator item = result.begin();
            item != result.end(); ++item)
          array->push_back(*item);
        return array;
      }

      case 'R': {
        in.get();
        // a reference is written "num gen R": both operands must be there
        assert(result.size() >= 2);
        PDF::Real::Ptr gen = result[result.size() - 1].dyn_cast<PDF::Real>();
        PDF::Real::Ptr num = result[result.size() - 2].dyn_cast<PDF::Real>();
        assert(gen && num);
        result.pop_back();
        result.pop_back();
        result.push_back(PDF::Ref::create(int(num->get_value()),
                                          int(gen->get_value())));
        break;
      }

      default:
        result.push_back(ParsePDF::readObject(in));
      }
    }
  }

  // Read a numeric object from `in` and return it as a PDF::Real.
  // Throws std::runtime_error if no number can be extracted, instead of
  // handing back an unspecified value and leaving the stream failed.
  PDF::Real::Ptr readNumber(istream& in) {
    // Todo: create a PDF::Integer if the number is an integer
    float num = 0;
    if(!(in >> num))
      throw std::runtime_error("Failed to read number from pdf");
    return PDF::Real::create(num);
  }
};


// Position `in` at the start of the keyword "trailer" nearest the end of
// the stream.
//
// The last 128 bytes are searched first; as long as nothing is found the
// window is doubled, until the whole stream has been searched. Within a
// window the last occurrence wins, since that is the one closest to the end
// of the file.
//
// Throws std::runtime_error if the stream cannot be sought or does not
// contain the keyword.
void ParsePDF::findTrailer(istream& in) {
  const string target = "trailer";

  in.seekg(0, ios_base::end);
  const std::streamoff size = in.tellg();
  if(size < 0)
    throw std::runtime_error("Failed to find PDF trailer");

  std::streamoff window = 128;  // Start searching last 128 bytes
  while(true) {
    // Clamp to the start so that files shorter than the window still work.
    const std::streamoff start = window < size ? size - window : 0;
    in.clear();
    in.seekg(start, ios_base::beg);

    bool found = false;
    istream::pos_type match_end;
    string::size_type pos = 0;
    char ch;
    while(in.get(ch)) {
      if(ch == target[pos]) {
        if(++pos == target.size()) {
          found = true;
          match_end = in.tellg();
          pos = 0;
        }
      } else {
        // 't' occurs only at the start of the keyword, so after a mismatch
        // the only possible partial match is the current character itself.
        pos = (ch == target[0]) ? 1 : 0;
      }
    }

    if(found) {
      in.clear();
      // Convert to a signed offset before subtracting: negating the
      // unsigned length directly would wrap around.
      in.seekg(match_end - std::streamoff(target.size()));
      return;
    }
    if(start == 0)
      break;                    // the entire file has been searched
    window *= 2;                // Nothing found, search wider
  }
  throw std::runtime_error("Failed to find PDF trailer");
}


// Read one object from `in` and return it. The kind of object is chosen
// from the next character; `in` must not be positioned on white space.
//
//   %           comment: skipped up to the end of the line, then the next
//               object is read
//   (           literal string
//   /           name
//   digit + - . number (reference handling is left to the callers)
//   <<  or  <   dictionary or hex string
//   [           array
//   null        returned as an empty PDF::Object::Ptr
//   true/false  boolean
//
// Throws std::runtime_error on any other input, including end of file.
PDF::Object::Ptr ParsePDF::readObject(istream& in) {
  const int next = in.peek();
  if(next == istream::traits_type::eof())
    throw std::runtime_error("Unexpected end of file in pdf");

  switch(next) {
  case '%': {
    // A comment runs to the end of the line (CR or LF).
    char c;
    while(in.get(c) && c != '\n' && c != '\r') {}
    in >> ws;
    return readObject(in);    // the next object after the comment
  }

  case '(': return readString(in);
  case '/': return readName(in);

  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
  case '+': case '-': case '.':
    // What about references?
    return readNumber(in);

  case '<': {
    in.get();                 // peek one more char ahead
    const int second = in.peek();
    in.putback('<');
    if(second == '<')
      return readDictionary(in);
    else
      return readHexString(in);
  }

  case '[':
    return readArray(in);

  case 'n': case 't': case 'f': {
    string st;
    if(getstring(in, st)) {
      if(st == "null")  return PDF::Object::Ptr();
      if(st == "true")  return PDF::Boolean::create(true);
      if(st == "false") return PDF::Boolean::create(false);
    }
    throw std::runtime_error
      ("Illegal token \"" + st + "\" at pos " +
       tostr(in.tellg() - std::streamoff(st.length())) + " in pdf");
  }

  default: {
    // Take the position before consuming the character so that the two
    // stream operations are sequenced.
    const istream::pos_type where = in.tellg();
    const char bad = char(in.get());
    throw std::runtime_error
      (std::string("Illegal token \"") + std::string(1, bad) + "\" at pos " +
       tostr(where) + " in pdf");
  }
  }
}

// Read a dictionary "<< key value ... >>" from `in`, which must be
// positioned on the opening "<<", and return it.
//
// Keys and values are read with ParsePDF::readObject. An 'R' turns the two
// preceding numbers (object number, generation) into one PDF::Ref.
//
// Throws std::runtime_error (directly or through assert) if a key is not a
// name, a key has no value, an 'R' is not preceded by two numbers, or the
// input ends before the closing ">>".
PDF::Dictionary::Ptr ParsePDF::readDictionary(istream& in) {
  assert(in.get() == '<' && in.get() == '<');
  typedef std::vector<PDF::Object::Ptr> RVec;
  RVec result;

  while(true) {
    in >> ws;
    const int next = in.peek();
    if(next == istream::traits_type::eof())
      throw std::runtime_error("Unexpected end of file in pdf dictionary");

    switch(next) {
    case '>': {
      assert(in.get() == '>' && in.get() == '>');
      // End of the dictionary; the collected objects must pair up
      if(result.size() % 2 != 0)
        throw std::runtime_error
          ("Malformed dictionary: key without a value");

      PDF::Dictionary::Ptr dict = PDF::Dictionary::create();
      for(RVec::size_type k = 0; k < result.size(); k += 2) {
        PDF::Name::Ptr name = result[k].dyn_cast<PDF::Name>();
        if(!name)
          throw std::runtime_error
            ("Malformed dictionary: keys must be names");
        dict->set_entry(name->get_name(), result[k + 1]);
      }
      return dict;
    }

    case 'R': {
      in.get();
      // a reference is written "num gen R": both operands must be there
      assert(result.size() >= 2);
      PDF::Real::Ptr gen = result[result.size() - 1].dyn_cast<PDF::Real>();
      PDF::Real::Ptr num = result[result.size() - 2].dyn_cast<PDF::Real>();
      assert(gen && num);
      result.pop_back();
      result.pop_back();
      result.push_back(PDF::Ref::create(int(num->get_value()),
                                        int(gen->get_value())));
      break;
    }

    default:
      result.push_back(ParsePDF::readObject(in));
    }
  }
}

Generated by  Doxygen 1.6.0   Back to index