author     Calvin Morrison <calvin@pobox.com>  2023-04-05 14:13:39 -0400
committer  Calvin Morrison <calvin@pobox.com>  2023-04-05 14:13:39 -0400
commit     835e373b3eeaabcd0621ed6798ab500f37982fae (patch)
tree       dfa16b0e2e1b4956b38f693220eac4e607802133 /xpdf/Parser.cc
xpdf-no-select-disable (HEAD, master)
Diffstat (limited to 'xpdf/Parser.cc')
-rw-r--r--  xpdf/Parser.cc  316
1 file changed, 316 insertions, 0 deletions
diff --git a/xpdf/Parser.cc b/xpdf/Parser.cc
new file mode 100644
index 0000000..4fad6af
--- /dev/null
+++ b/xpdf/Parser.cc
@@ -0,0 +1,316 @@
+//========================================================================
+//
+// Parser.cc
+//
+// Copyright 1996-2003 Glyph & Cog, LLC
+//
+//========================================================================
+
+#include <aconf.h>
+
+#ifdef USE_GCC_PRAGMAS
+#pragma implementation
+#endif
+
+#include <stddef.h>
+#include <string.h>
+#include "gmempp.h"
+#include "Object.h"
+#include "Array.h"
+#include "Dict.h"
+#include "Decrypt.h"
+#include "Parser.h"
+#include "XRef.h"
+#include "Error.h"
+
+// Max number of nested objects. This is used to catch infinite loops
+// in the object structure.
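+// Each nested array or dictionary adds one level of recursion in getObj(),
+// so a pathologically deep structure such as [[[[...]]]] stops being parsed
+// as compound objects once this depth is reached.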
+#define recursionLimit 500
+
+Parser::Parser(XRef *xrefA, Lexer *lexerA, GBool allowStreamsA) {
+  xref = xrefA;
+  lexer = lexerA;
+  inlineImg = 0;
+  allowStreams = allowStreamsA;
+  lexer->getObj(&buf1);
+  lexer->getObj(&buf2);
+}
+
+Parser::~Parser() {
+  buf1.free();
+  buf2.free();
+  delete lexer;
+}
+
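+// Parse the next object from the input into <obj>.  If <simpleOnly> is set,
+// arrays and dictionaries are not parsed recursively.  If <fileKey> is
+// non-NULL, strings and streams are decrypted with the given algorithm and
+// key, using <objNum>/<objGen> as the decryption object identifier.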
+Object *Parser::getObj(Object *obj, GBool simpleOnly,
+                       Guchar *fileKey,
+                       CryptAlgorithm encAlgorithm, int keyLength,
+                       int objNum, int objGen, int recursion) {
+  char *key;
+  Stream *str;
+  Object obj2;
+  int num;
+  DecryptStream *decrypt;
+  GString *s, *s2;
+  int c;
+
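+  // shift() does not read ahead while inline image data follows an "ID"
+  // operator (see shift() below), so by the time inlineImg reaches 2 both
+  // lookahead tokens are stale and must be re-read from the lexer.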
+  // refill buffer after inline image data
+  if (inlineImg == 2) {
+    buf1.free();
+    buf2.free();
+    lexer->getObj(&buf1);
+    lexer->getObj(&buf2);
+    inlineImg = 0;
+  }
+
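+  // Compound objects are parsed recursively: an array is written
+  // "[ elem0 elem1 ... ]" and a dictionary "<< /Key value ... >>", and each
+  // element or value comes from a nested getObj() call.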
+  // array
+  if (!simpleOnly && recursion < recursionLimit && buf1.isCmd("[")) {
+    shift();
+    obj->initArray(xref);
+    while (!buf1.isCmd("]") && !buf1.isEOF())
+      obj->arrayAdd(getObj(&obj2, gFalse, fileKey, encAlgorithm, keyLength,
+                           objNum, objGen, recursion + 1));
+    if (buf1.isEOF())
+      error(errSyntaxError, getPos(), "End of file inside array");
+    shift();
+
+  // dictionary or stream
+  } else if (!simpleOnly && recursion < recursionLimit && buf1.isCmd("<<")) {
+    shift();
+    obj->initDict(xref);
+    while (!buf1.isCmd(">>") && !buf1.isEOF()) {
+      if (!buf1.isName()) {
+        error(errSyntaxError, getPos(),
+              "Dictionary key must be a name object");
+        shift();
+      } else {
+        key = copyString(buf1.getName());
+        shift();
+        if (buf1.isEOF() || buf1.isError()) {
+          gfree(key);
+          break;
+        }
+        obj->dictAdd(key, getObj(&obj2, gFalse,
+                                 fileKey, encAlgorithm, keyLength,
+                                 objNum, objGen, recursion + 1));
+      }
+    }
+    if (buf1.isEOF())
+      error(errSyntaxError, getPos(), "End of file inside dictionary");
+    // stream objects are not allowed inside content streams or
+    // object streams
+    if (allowStreams && buf2.isCmd("stream")) {
+      if ((str = makeStream(obj, fileKey, encAlgorithm, keyLength,
+                            objNum, objGen, recursion + 1))) {
+        obj->initStream(str);
+      } else {
+        obj->free();
+        obj->initError();
+      }
+    } else {
+      shift();
+    }
+
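+  // An indirect reference is two integers followed by the "R" keyword,
+  // e.g. "12 0 R" (object number 12, generation 0); a lone integer is
+  // just an integer.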
+  // indirect reference or integer
+  } else if (buf1.isInt()) {
+    num = buf1.getInt();
+    shift();
+    if (buf1.isInt() && buf2.isCmd("R")) {
+      obj->initRef(num, buf1.getInt());
+      shift();
+      shift();
+    } else {
+      obj->initInt(num);
+    }
+
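+  // Strings in an encrypted document are decrypted here: the raw bytes are
+  // wrapped in a MemStream and filtered through a DecryptStream keyed by
+  // the enclosing object's number and generation.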
+  // string
+  } else if (buf1.isString() && fileKey) {
+    s = buf1.getString();
+    s2 = new GString();
+    obj2.initNull();
+    decrypt = new DecryptStream(new MemStream(s->getCString(), 0,
+                                              s->getLength(), &obj2),
+                                fileKey, encAlgorithm, keyLength,
+                                objNum, objGen);
+    decrypt->reset();
+    while ((c = decrypt->getChar()) != EOF) {
+      s2->append((char)c);
+    }
+    delete decrypt;
+    obj->initString(s2);
+    shift();
+
+  // simple object
+  } else {
+    buf1.copy(obj);
+    shift();
+  }
+
+  return obj;
+}
+
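+// Build the Stream for a stream object whose dictionary has just been
+// parsed into <dict> (called from getObj() when a dictionary is followed
+// by the "stream" keyword).  Returns NULL if the underlying input stream
+// is gone or the extent of the stream data cannot be determined.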
+Stream *Parser::makeStream(Object *dict, Guchar *fileKey,
+                           CryptAlgorithm encAlgorithm, int keyLength,
+                           int objNum, int objGen, int recursion) {
+  // get stream start position
+  lexer->skipToNextLine();
+  Stream *curStr = lexer->getStream();
+  if (!curStr) {
+    return NULL;
+  }
+  GFileOffset pos = curStr->getPos();
+
+  GBool haveLength = gFalse;
+  GFileOffset length = 0;
+  GFileOffset endPos;
+
+  // check for length in damaged file
+  if (xref && xref->getStreamEnd(pos, &endPos)) {
+    length = endPos - pos;
+    haveLength = gTrue;
+
+  // get length from the stream object
+  } else {
+    Object obj;
+    dict->dictLookup("Length", &obj, recursion);
+    if (obj.isInt()) {
+      length = (GFileOffset)(Guint)obj.getInt();
+      haveLength = gTrue;
+    } else {
+      error(errSyntaxError, getPos(),
+            "Missing or invalid 'Length' attribute in stream");
+    }
+    obj.free();
+  }
+
+  // in badly damaged PDF files, we can run off the end of the input
+  // stream immediately after the "stream" token
+  if (!lexer->getStream()) {
+    return NULL;
+  }
+
+  // copy the base stream (the Lexer will free its stream objects when it
+  // reaches end of stream -- which can happen while reading ahead below)
+  BaseStream *baseStr =
+    (BaseStream *)lexer->getStream()->getBaseStream()->copy();
+
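+  // A stream object is written "<< ... /Length n ... >> stream ... endstream".
+  // When the data has to be scanned for the "endstream" keyword, the "- 9"
+  // below backs up over the nine characters of "endstream" itself so that
+  // the length covers only the stream data.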
+  // 'Length' attribute is missing -- search for 'endstream'
+  if (!haveLength) {
+    GBool foundEndstream = gFalse;
+    char endstreamBuf[8];
+    if ((curStr = lexer->getStream())) {
+      int c;
+      while ((c = curStr->getChar()) != EOF) {
+        if (c == 'e' &&
+            curStr->getBlock(endstreamBuf, 8) == 8 &&
+            !memcmp(endstreamBuf, "ndstream", 8)) {
+          length = curStr->getPos() - 9 - pos;
+          foundEndstream = gTrue;
+          break;
+        }
+      }
+    }
+    if (!foundEndstream) {
+      error(errSyntaxError, getPos(), "Couldn't find 'endstream' for stream");
+      delete baseStr;
+      return NULL;
+    }
+  }
+
+  // make new base stream
+  Stream *str = baseStr->makeSubStream(pos, gTrue, length, dict);
+
+  // look for the 'endstream' marker
+  if (haveLength) {
+    // skip over stream data
+    lexer->setPos(pos + length);
+
+    // check for 'endstream'
+    // NB: we never reuse the Parser object to parse objects after a
+    // stream, and we could (if the PDF file is damaged) be in the
+    // middle of binary data at this point, so we check the stream
+    // data directly for 'endstream', rather than calling shift() to
+    // parse objects
+    GBool foundEndstream = gFalse;
+    char endstreamBuf[8];
+    if ((curStr = lexer->getStream())) {
+      // skip up to 100 whitespace chars
+      int c;
+      for (int i = 0; i < 100; ++i) {
+        c = curStr->getChar();
+        if (!Lexer::isSpace(c)) {
+          break;
+        }
+      }
+      if (c == 'e') {
+        if (curStr->getBlock(endstreamBuf, 8) == 8 &&
+            !memcmp(endstreamBuf, "ndstream", 8)) {
+          foundEndstream = gTrue;
+        }
+      }
+    }
+    if (!foundEndstream) {
+      error(errSyntaxError, getPos(), "Missing 'endstream'");
+      // kludge for broken PDF files: just add 5k to the length, and
+      // hope it's enough
+      // (dict is now owned by str, so we need to copy it before deleting str)
+      Object obj;
+      dict->copy(&obj);
+      delete str;
+      length += 5000;
+      str = baseStr->makeSubStream(pos, gTrue, length, &obj);
+    }
+  }
+
+  // free the copied base stream
+  delete baseStr;
+
+  // handle decryption
+  if (fileKey) {
+    // the 'Crypt' filter is used to mark unencrypted metadata streams
+    //~ this should also check for an empty DecodeParams entry
+    GBool encrypted = gTrue;
+    Object obj;
+    dict->dictLookup("Filter", &obj, recursion);
+    if (obj.isName("Crypt")) {
+      encrypted = gFalse;
+    } else if (obj.isArray() && obj.arrayGetLength() >= 1) {
+      Object obj2;
+      if (obj.arrayGet(0, &obj2)->isName("Crypt")) {
+        encrypted = gFalse;
+      }
+      obj2.free();
+    }
+    obj.free();
+    if (encrypted) {
+      str = new DecryptStream(str, fileKey, encAlgorithm, keyLength,
+                              objNum, objGen);
+    }
+  }
+
+  // get filters
+  str = str->addFilters(dict, recursion);
+
+  return str;
+}
+
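+// Advance the two-token lookahead: buf1 takes buf2's token and the next
+// token is read into buf2 -- except while inline image data follows an
+// "ID" operator, when buf2 is left null so the lexer does not read into
+// the image data.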
+void Parser::shift() {
+  if (inlineImg > 0) {
+    if (inlineImg < 2) {
+      ++inlineImg;
+    } else {
+      // in a damaged content stream, if 'ID' shows up in the middle
+      // of a dictionary, we need to reset
+      inlineImg = 0;
+    }
+  } else if (buf2.isCmd("ID")) {
+    lexer->skipChar();  // skip char after 'ID' command
+    inlineImg = 1;
+  }
+  buf1.free();
+  buf1 = buf2;
+  if (inlineImg > 0)  // don't buffer inline image data
+    buf2.initNull();
+  else
+    lexer->getObj(&buf2);
+}