diff options
Diffstat (limited to 'xpdf/HTMLGen.cc')
| -rw-r--r-- | xpdf/HTMLGen.cc | 1120 | 
1 files changed, 1120 insertions, 0 deletions
| diff --git a/xpdf/HTMLGen.cc b/xpdf/HTMLGen.cc new file mode 100644 index 0000000..a077655 --- /dev/null +++ b/xpdf/HTMLGen.cc @@ -0,0 +1,1120 @@
//========================================================================
//
// HTMLGen.cc
//
// Copyright 2010-2021 Glyph & Cog, LLC
//
//========================================================================

//~ to do:
//~ - fonts
//~   - underlined? (underlines are present in the background image)
//~   - include the original font name in the CSS entry (before the
//~     generic serif/sans-serif/monospace name)
//~ - check that htmlDir exists and is a directory
//~ - links:
//~   - internal links (to pages, to named destinations)
//~   - links from non-text content
//~ - rotated text should go in the background image
//~ - metadata
//~ - PDF outline

#include <aconf.h>

#ifdef USE_GCC_PRAGMAS
#pragma implementation
#endif

#include <stdlib.h>
#include <png.h>
#include "gmem.h"
#include "gmempp.h"
#include "GString.h"
#include "GList.h"
#include "SplashBitmap.h"
#include "PDFDoc.h"
#include "GfxFont.h"
#include "AcroForm.h"
#include "TextOutputDev.h"
#include "SplashOutputDev.h"
#include "ErrorCodes.h"
#include "WebFont.h"
#include "HTMLGen.h"

// Windows spells the case-insensitive string comparisons differently.
#ifdef _WIN32
#  define strcasecmp stricmp
#  define strncasecmp strnicmp
#endif

//------------------------------------------------------------------------

// One entry of the font-name style-suffix table: a suffix string and
// the bold/italic flags that suffix implies.
struct FontStyleTagInfo {
  const char *tag;		// suffix text; NULL marks the end of the table
  int tagLen;			// number of characters in <tag>
  GBool bold;			// bold flag applied when the suffix matches
  GBool italic;			// italic flag applied when the suffix matches
};

// NB: these are compared, in order, against the tail of the font
// name, so "BoldItalic" must come before "Italic", etc.
// Style suffixes looked for at the end of a font name.  The table is
// scanned from the top by getFontDetails() and the first
// case-insensitive match wins, so longer suffixes have to precede the
// shorter suffixes they end with.  The NULL tag terminates the scan.
static FontStyleTagInfo fontStyleTags[] = {
  {"Roman",                    5, gFalse, gFalse},
  {"Regular",                  7, gFalse, gFalse},
  {"Condensed",                9, gFalse, gFalse},
  {"CondensedBold",           13, gTrue,  gFalse},
  {"CondensedLight",          14, gFalse, gFalse},
  {"SemiBold",                 8, gTrue,  gFalse},
  {"BoldItalicMT",            12, gTrue,  gTrue},
  {"BoldItalic",              10, gTrue,  gTrue},
  {"Bold_Italic",             11, gTrue,  gTrue},
  {"BoldOblique",             11, gTrue,  gTrue},
  {"Bold_Oblique",            12, gTrue,  gTrue},
  {"BoldMT",                   6, gTrue,  gFalse},
  {"Bold",                     4, gTrue,  gFalse},
  {"ItalicMT",                 8, gFalse, gTrue},
  {"Italic",                   6, gFalse, gTrue},
  {"Oblique",                  7, gFalse, gTrue},
  {"Light",                    5, gFalse, gFalse},
  {NULL,                       0, gFalse, gFalse}
};

// A well-known font family name together with the fixed-width / serif
// classification used in place of the font descriptor flags.
struct StandardFontInfo {
  const char *name;		// family name; NULL marks the end of the table
  GBool fixedWidth;		// true for monospaced families
  GBool serif;			// true for serif families
};

// Known family names, scanned in order by getFontDetails(); the first
// entry that compares equal (case-insensitively, over the length of
// the font name that is left after removing the style suffix) wins.
static StandardFontInfo standardFonts[] = {
  {"Arial",                    gFalse, gFalse},
  {"Courier",                  gTrue,  gFalse},
  {"Futura",                   gFalse, gFalse},
  {"Helvetica",                gFalse, gFalse},
  {"Minion",                   gFalse, gTrue},
  {"NewCenturySchlbk",         gFalse, gTrue},
  {"Times",                    gFalse, gTrue},
  {"TimesNew",                 gFalse, gTrue},
  {"Times_New",                gFalse, gTrue},
  {"Verdana",                  gFalse, gFalse},
  {"LucidaSans",               gFalse, gFalse},
  {NULL,                       gFalse, gFalse}
};

// Reference 'm' width of a substitute font; getFontDetails() divides
// the PDF font's own 'm' width by this value to get a scale factor.
struct SubstFontInfo {
  double mWidth;
};

// index: {fixed:8, serif:4, sans-serif:0} + bold*2 + italic
// Only indexes 0..11 can be produced by that formula; the remaining
// four elements are not listed and are therefore zero-initialized.
static SubstFontInfo substFonts[16] = {
  // sans-serif: regular, italic, bold, bold italic
  {0.833},
  {0.833},
  {0.889},
  {0.889},
  // serif: regular, italic, bold, bold italic
  {0.788},
  {0.722},
  {0.833},
  {0.778},
  // fixed width: regular, italic, bold, bold italic
  {0.600},
  {0.600},
  {0.600},
  {0.600}
};
+ +// Map Unicode indexes from the private use area, following the Adobe +// Glyph list. +#define privateUnicodeMapStart 0xf6f9 +#define privateUnicodeMapEnd   0xf7ff +static int +privateUnicodeMap[privateUnicodeMapEnd - privateUnicodeMapStart + 1] = { +  0x0141, 0x0152, 0,      0,      0x0160, 0,      0x017d,         // f6f9 +  0,      0,      0,      0,      0,      0,      0,      0,      // f700 +  0,      0,      0,      0,      0,      0,      0,      0, +  0,      0,      0,      0,      0,      0,      0,      0,      // f710 +  0,      0,      0,      0,      0,      0,      0,      0, +  0,      0x0021, 0,      0,      0x0024, 0,      0x0026, 0,      // f720 +  0,      0,      0,      0,      0,      0,      0,      0, +  0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // f730 +  0x0038, 0x0039, 0,      0,      0,      0,      0,      0x003f, +  0,      0,      0,      0,      0,      0,      0,      0,      // f740 +  0,      0,      0,      0,      0,      0,      0,      0, +  0,      0,      0,      0,      0,      0,      0,      0,      // f750 +  0,      0,      0,      0,      0,      0,      0,      0, +  0,      0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // f760 +  0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, +  0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // f770 +  0x0058, 0x0059, 0x005a, 0,      0,      0,      0,      0, +  0,      0,      0,      0,      0,      0,      0,      0,      // f780 +  0,      0,      0,      0,      0,      0,      0,      0, +  0,      0,      0,      0,      0,      0,      0,      0,      // f790 +  0,      0,      0,      0,      0,      0,      0,      0, +  0,      0x00a1, 0x00a2, 0,      0,      0,      0,      0,      // f7a0 +  0,      0,      0,      0,      0,      0,      0,      0, +  0,      0,      0,      0,      0,      0,      0,      0,      // f7b0 +  0,      0,      0,      0,      0,      0,      0,      0x00bf, +  0,      
0,      0,      0,      0,      0,      0,      0,      // f7c0 +  0,      0,      0,      0,      0,      0,      0,      0, +  0,      0,      0,      0,      0,      0,      0,      0,      // f7d0 +  0,      0,      0,      0,      0,      0,      0,      0, +  0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // f7e0 +  0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, +  0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0,      // f7f0 +  0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178 +}; + +enum VerticalAlignment { +  vertAlignBaseline, +  vertAlignSub, +  vertAlignSuper, +  vertAlignTop +}; + +static const char *vertAlignNames[] = { +  "baseline", +  "sub", +  "super", +  "top" +}; + +//------------------------------------------------------------------------ + +class HTMLGenFontDefn { +public: + +  HTMLGenFontDefn(Ref fontIDA, GString *fontFaceA, GString *fontSpecA, +		  double scaleA) +    : fontID(fontIDA), fontFace(fontFaceA), fontSpec(fontSpecA) +    , scale(scaleA), used(gFalse) {} +  ~HTMLGenFontDefn() { delete fontFace; delete fontSpec; } +  GBool match(Ref fontIDA) +    { return fontIDA.num == fontID.num && fontIDA.gen == fontID.gen; } + +  Ref fontID; +  GString *fontFace;		// NULL for substituted fonts +  GString *fontSpec; +  double scale; +  GBool used;			// set when used (per page) +}; + +//------------------------------------------------------------------------ + +class HTMLGenFormFieldInfo { +public: + +  HTMLGenFormFieldInfo(AcroFormField *acroFormFieldA) +    : acroFormField(acroFormFieldA) {} + +  AcroFormField *acroFormField; +}; + +//------------------------------------------------------------------------ + +class Base64Encoder { +public: + +  Base64Encoder(int (*writeFuncA)(void *stream, const char *data, int size), +		void *streamA); +  void encode(const unsigned char *data, size_t size); +  void flush(); + +private: + +  int (*writeFunc)(void *stream, const char *data, int size); +  
void *stream; +  unsigned char buf[3]; +  int bufLen; +}; + +static char base64Chars[65] = +    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +Base64Encoder::Base64Encoder(int (*writeFuncA)(void *stream, const char *data, +					       int size), +			     void *streamA) { +  writeFunc = writeFuncA; +  stream = streamA; +  bufLen = 0; +} + +void Base64Encoder::encode(const unsigned char *data, size_t size) { +  size_t i = 0; +  while (1) { +    while (bufLen < 3) { +      if (i >= size) { +	return; +      } +      buf[bufLen++] = data[i++]; +    } +    char out[4]; +    out[0] = base64Chars[(buf[0] >> 2) & 0x3f]; +    out[1] = base64Chars[((buf[0] << 4) | (buf[1] >> 4)) & 0x3f]; +    out[2] = base64Chars[((buf[1] << 2) | (buf[2] >> 6)) & 0x3f]; +    out[3] = base64Chars[buf[2] & 0x3f]; +    writeFunc(stream, out, 4); +    bufLen = 0; +  } +} + +void Base64Encoder::flush() { +  // if bufLen == 0, this does nothing +  // bufLen should never be 3 here +  char out[4]; +  if (bufLen == 1) { +    out[0] = base64Chars[(buf[0] >> 2) & 0x3f]; +    out[1] = base64Chars[(buf[0] << 4) & 0x3f]; +    out[2] = '='; +    out[3] = '='; +    writeFunc(stream, out, 4); +  } else if (bufLen == 2) { +    out[0] = base64Chars[(buf[0] >> 2) & 0x3f]; +    out[1] = base64Chars[((buf[0] << 4) | (buf[1] >> 4)) & 0x3f]; +    out[2] = base64Chars[(buf[1] << 2) & 0x3f]; +    out[3] = '='; +    writeFunc(stream, out, 4); +  } +} + +static int writeToString(void *stream, const char *data, int size) { +  ((GString *)stream)->append(data, size); +  return size; +} + +//------------------------------------------------------------------------ + + +//------------------------------------------------------------------------ + +HTMLGen::HTMLGen(double backgroundResolutionA, GBool tableMode) { +  TextOutputControl textOutControl; +  SplashColor paperColor; + +  ok = gTrue; + +  backgroundResolution = backgroundResolutionA; +  zoom = 1.0; +  vStretch = 1.0; +  drawInvisibleText = 
gTrue; +  allTextInvisible = gFalse; +  extractFontFiles = gFalse; +  convertFormFields = gFalse; +  embedBackgroundImage = gFalse; +  embedFonts = gFalse; + +  // set up the TextOutputDev +  textOutControl.mode = tableMode ? textOutTableLayout : textOutReadingOrder; +  textOutControl.html = gTrue; +  textOutControl.splitRotatedWords = gTrue; +  textOut = new TextOutputDev(NULL, &textOutControl, gFalse); +  if (!textOut->isOk()) { +    ok = gFalse; +  } + +  // set up the SplashOutputDev +  paperColor[0] = paperColor[1] = paperColor[2] = 0xff; +  splashOut = new SplashOutputDev(splashModeRGB8, 1, gFalse, paperColor); + +  fontDefns = NULL; +} + +HTMLGen::~HTMLGen() { +  delete textOut; +  delete splashOut; +  if (fontDefns) { +    deleteGList(fontDefns, HTMLGenFontDefn); +  } +} + +void HTMLGen::startDoc(PDFDoc *docA) { +  doc = docA; +  splashOut->startDoc(doc->getXRef()); + +  if (fontDefns) { +    deleteGList(fontDefns, HTMLGenFontDefn); +  } +  fontDefns = new GList(); +  nextFontFaceIdx = 0; +} + +static inline int pr(int (*writeFunc)(void *stream, const char *data, int size), +		     void *stream, const char *data) { +  return writeFunc(stream, data, (int)strlen(data)); +} + +static int pf(int (*writeFunc)(void *stream, const char *data, int size), +	      void *stream, const char *fmt, ...) 
{ +  va_list args; +  GString *s; +  int ret; + +  va_start(args, fmt); +  s = GString::formatv(fmt, args); +  va_end(args); +  ret = writeFunc(stream, s->getCString(), s->getLength()); +  delete s; +  return ret; +} + +struct PNGWriteInfo { +  Base64Encoder *base64; +  int (*writePNG)(void *stream, const char *data, int size); +  void *pngStream; +}; + +static void pngWriteFunc(png_structp png, png_bytep data, png_size_t size) { +  PNGWriteInfo *info = (PNGWriteInfo *)png_get_progressive_ptr(png); +  if (info->base64) { +    info->base64->encode(data, size); +  } else { +    info->writePNG(info->pngStream, (char *)data, (int)size); +  } +} + +int HTMLGen::convertPage( +		 int pg, const char *pngURL, const char *htmlDir, +		 int (*writeHTML)(void *stream, const char *data, int size), +		 void *htmlStream, +		 int (*writePNG)(void *stream, const char *data, int size), +		 void *pngStream) { +  png_structp png; +  png_infop pngInfo; +  PNGWriteInfo writeInfo; +  SplashBitmap *bitmap; +  Guchar *p; +  double pageW, pageH; +  TextPage *text; +  GList *cols, *pars, *lines, *words; +  TextFontInfo *font; +  TextColumn *col; +  TextParagraph *par; +  TextLine *line; +  HTMLGenFontDefn *fontDefn; +  GString *s; +  double base; +  int primaryDir, spanDir; +  int colIdx, parIdx, lineIdx, firstWordIdx, lastWordIdx; +  int y, i; + +  // generate the background bitmap +  splashOut->setSkipText(!allTextInvisible, gFalse); +  doc->displayPage(splashOut, pg, +		   backgroundResolution, backgroundResolution * vStretch, +		   0, gFalse, gTrue, gFalse); +  bitmap = splashOut->getBitmap(); + +  // page size +  if (doc->getPageRotate(pg) == 90 || doc->getPageRotate(pg) == 270) { +    pageW = doc->getPageCropHeight(pg); +    pageH = doc->getPageCropWidth(pg); +  } else { +    pageW = doc->getPageCropWidth(pg); +    pageH = doc->getPageCropHeight(pg); +  } + +  // get the PDF text +  doc->displayPage(textOut, pg, 72, 72, 0, gFalse, gTrue, gFalse); +  doc->processLinks(textOut, pg); +  
text = textOut->takeText(); +  primaryDir = text->primaryDirectionIsLR() ? 1 : -1; + +  // insert a special character for each form field; +  // remove existing characters inside field bboxes; +  // erase background content inside field bboxes +  formFieldFont = NULL; +  formFieldInfo = NULL; +  if (convertFormFields) { +    AcroForm *form = doc->getCatalog()->getForm(); +    if (form) { +      formFieldInfo = new GList(); +      formFieldFont = new TextFontInfo(); +      double yTop = doc->getCatalog()->getPage(pg)->getMediaBox()->y2; +      for (i = 0; i < form->getNumFields(); ++i) { +	AcroFormField *field = form->getField(i); +	AcroFormFieldType fieldType = field->getAcroFormFieldType(); +	if (field->getPageNum() == pg && +	    (fieldType == acroFormFieldText || +	     fieldType == acroFormFieldCheckbox)) { +	  double llx, lly, urx, ury; +	  field->getBBox(&llx, &lly, &urx, &ury); +	  lly = yTop - lly; +	  ury = yTop - ury; + +	  // add the field info +	  int fieldIdx = formFieldInfo->getLength(); +	  formFieldInfo->append(new HTMLGenFormFieldInfo(field)); + +	  // remove exsting chars +	  text->removeChars(llx, ury, urx, lly, 0.75, 0.5); +	 +	  // erase background content +	  int llxI = (int)(llx * backgroundResolution / 72 + 0.5); +	  int llyI = (int)(lly * backgroundResolution * vStretch / 72 + 0.5); +	  int urxI = (int)(urx * backgroundResolution / 72 + 0.5); +	  int uryI = (int)(ury * backgroundResolution * vStretch / 72 + 0.5); +	  llyI += (int)(backgroundResolution * vStretch / 20); +	  if (llxI < 0) { +	    llxI = 0; +	  } +	  if (urxI >= bitmap->getWidth()) { +	    urxI = bitmap->getWidth() - 1; +	  } +	  if (uryI < 0) { +	    uryI = 0; +	  } +	  if (llyI > bitmap->getHeight()) { +	    llyI = bitmap->getHeight() - 1; +	  } +	  if (uryI <= llyI && llxI <= urxI) { +	    SplashColorPtr p = bitmap->getDataPtr() +	                         + uryI * bitmap->getRowSize() + llxI * 3; +	    for (int y = uryI; y <= llyI; ++y) { +	      memset(p, 0xff, (urxI - 
llxI + 1) * 3); +	      p += bitmap->getRowSize(); +	    } +	  } + +	  // add a special char +	  // (the font size is unused -- 10 is an arbitrary value) +	  text->addSpecialChar(llx, ury, urx, lly, +			       0, formFieldFont, 10, 0x80000000 + fieldIdx); +	} +      } +    } +  } + +  // HTML header +  pr(writeHTML, htmlStream, "<html>\n"); +  pr(writeHTML, htmlStream, "<head>\n"); +  pr(writeHTML, htmlStream, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n"); +  pr(writeHTML, htmlStream, "<style type=\"text/css\">\n"); +  pr(writeHTML, htmlStream, ".txt { white-space:nowrap; }\n"); +  if (convertFormFields) { +    pr(writeHTML, htmlStream, ".textfield {\n"); +    pr(writeHTML, htmlStream, "  border: 0;\n"); +    pr(writeHTML, htmlStream, "  padding: 0;\n"); +    pr(writeHTML, htmlStream, "  background: #ccccff;\n"); +    pr(writeHTML, htmlStream, "}\n"); +    pr(writeHTML, htmlStream, ".checkbox {\n"); +    pr(writeHTML, htmlStream, "}\n"); +  } +  fonts = text->getFonts(); +  fontScales = (double *)gmallocn(fonts->getLength(), sizeof(double)); +  for (i = 0; i < fontDefns->getLength(); ++i) { +    fontDefn = (HTMLGenFontDefn *)fontDefns->get(i); +    fontDefn->used = gFalse; +  } +  for (i = 0; i < fonts->getLength(); ++i) { +    font = (TextFontInfo *)fonts->get(i); +    fontDefn = getFontDefn(font, htmlDir); +    if (!fontDefn->used && fontDefn->fontFace) { +      pr(writeHTML, htmlStream, fontDefn->fontFace->getCString()); +    } +    pf(writeHTML, htmlStream, ".f{0:d} {{ {1:t} }}\n", i, fontDefn->fontSpec); +    fontScales[i] = fontDefn->scale; +    fontDefn->used = gTrue; +  } +  pr(writeHTML, htmlStream, "</style>\n"); +  pr(writeHTML, htmlStream, "</head>\n"); +  if (primaryDir >= 0) { +    pr(writeHTML, htmlStream, "<body>\n"); +  } else { +    pr(writeHTML, htmlStream, "<body dir=\"rtl\">\n"); +  } + +  // background image element (part 1) +  if (primaryDir >= 0) { +    pf(writeHTML, htmlStream, "<img style=\"position:absolute; 
left:0px; top:0px;\" width=\"{0:d}\" height=\"{1:d}\" ", +       (int)(pageW * zoom), (int)(pageH * zoom * vStretch)); +  } else { +    pf(writeHTML, htmlStream, "<img style=\"position:absolute; right:0px; top:0px;\" width=\"{0:d}\" height=\"{1:d}\" ", +       (int)(pageW * zoom), (int)(pageH * zoom * vStretch)); +  } +  if (embedBackgroundImage) { +    pr(writeHTML, htmlStream, "src=\"data:image/png;base64,\n"); +    writeInfo.base64 = new Base64Encoder(writeHTML, htmlStream);  +    writeInfo.writePNG = NULL; +    writeInfo.pngStream = NULL; +  } else { +    pf(writeHTML, htmlStream, "src=\"{0:s}\"", pngURL); +    writeInfo.base64 = NULL; +    writeInfo.writePNG = writePNG; +    writeInfo.pngStream = pngStream; +  } + +  // background image data - writing to a separate file, or embedding +  // with base64 encoding +  if (!(png = png_create_write_struct(PNG_LIBPNG_VER_STRING, +				      NULL, NULL, NULL)) || +      !(pngInfo = png_create_info_struct(png))) { +    return errFileIO; +  } +  if (setjmp(png_jmpbuf(png))) { +    return errFileIO; +  } +  png_set_write_fn(png, &writeInfo, pngWriteFunc, NULL); +  png_set_IHDR(png, pngInfo, bitmap->getWidth(), bitmap->getHeight(), +	       8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE, +	       PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT); +  png_write_info(png, pngInfo); +  p = bitmap->getDataPtr(); +  for (y = 0; y < bitmap->getHeight(); ++y) { +    png_write_row(png, (png_bytep)p); +    p += bitmap->getRowSize(); +  } +  png_write_end(png, pngInfo); +  png_destroy_write_struct(&png, &pngInfo); +  if (embedBackgroundImage) { +    writeInfo.base64->flush(); +    delete writeInfo.base64; +  } + +  // background image element (part 2) +  pr(writeHTML, htmlStream, "\">\n"); + +  // generate the HTML text +  nextFieldID = 0; +  cols = text->makeColumns(); +  for (colIdx = 0; colIdx < cols->getLength(); ++colIdx) { +    col = (TextColumn *)cols->get(colIdx); +    pars = col->getParagraphs(); +    for (parIdx = 0; parIdx 
< pars->getLength(); ++parIdx) { +      par = (TextParagraph *)pars->get(parIdx); +      lines = par->getLines(); +      for (lineIdx = 0; lineIdx < lines->getLength(); ++lineIdx) { +	line = (TextLine *)lines->get(lineIdx); +	if (line->getRotation() != 0) { +	  continue; +	} +	words = line->getWords(); +	if (lineIdx == 0 && par->hasDropCap() && words->getLength() >= 2) { +	  base = ((TextWord *)words->get(1))->getBaseline(); +	} else { +	  base = line->getBaseline(); +	} +	s = new GString(); +	for (firstWordIdx = (primaryDir >= 0) ? 0 : words->getLength() - 1; +	     (primaryDir >= 0) ? firstWordIdx < words->getLength() +	                       : firstWordIdx >= 0; +	     firstWordIdx = lastWordIdx + primaryDir) { +	  lastWordIdx = findDirSpan(words, firstWordIdx, +				    primaryDir, &spanDir); +	  appendSpans(words, firstWordIdx, lastWordIdx, +		      primaryDir, spanDir, +		      base, lineIdx == 0 && par->hasDropCap(), +		      s); +	} +	if (primaryDir >= 0) { +	  pf(writeHTML, htmlStream, "<div class=\"txt\" style=\"position:absolute; left:{0:d}px; top:{1:d}px;\">{2:t}</div>\n", +	     (int)(line->getXMin() * zoom), +	     (int)(line->getYMin() * zoom * vStretch), s); +	} else { +	  pf(writeHTML, htmlStream, "<div class=\"txt\" style=\"position:absolute; right:{0:d}px; top:{1:d}px;\">{2:t}</div>\n", +	     (int)((pageW - line->getXMax()) * zoom), +	     (int)(line->getYMin() * zoom * vStretch), s); +	} +	delete s; +      } +    } +  } +  gfree(fontScales); +  delete text; +  deleteGList(cols, TextColumn); +  if (formFieldFont) { +    delete formFieldFont; +    formFieldFont = NULL; +  } +  if (formFieldInfo) { +    deleteGList(formFieldInfo, HTMLGenFormFieldInfo); +    formFieldInfo = NULL; +  } + +  // HTML trailer +  pr(writeHTML, htmlStream, "</body>\n"); +  pr(writeHTML, htmlStream, "</html>\n"); + +  return errNone; +} + +// Find a sequence of words, starting at <firstWordIdx>, that have the +// same writing direction.  
Returns the index of the last word, and +// sets *<spanDir> to the span direction. +int HTMLGen::findDirSpan(GList *words, int firstWordIdx, int primaryDir, +			 int *spanDir) { +  int dir0, dir1, nextWordIdx; + +  dir0 = ((TextWord *)words->get(firstWordIdx))->getDirection(); +  for (nextWordIdx = firstWordIdx + primaryDir; +       (primaryDir >= 0) ? nextWordIdx < words->getLength() +	                 : nextWordIdx >= 0; +       nextWordIdx += primaryDir) { +    dir1 = ((TextWord *)words->get(nextWordIdx))->getDirection(); +    if (dir0 == 0) { +      dir0 = dir1; +    } else if (dir1 != 0 && dir1 != dir0) { +      break; +    } +  } + +  if (dir0 == 0) { +    *spanDir = primaryDir; +  } else { +    *spanDir = dir0; +  } + +  return nextWordIdx - primaryDir; +} + +// Create HTML spans for words <firstWordIdx> .. <lastWordIdx>, and +// append them to <s>. +void HTMLGen::appendSpans(GList *words, int firstWordIdx, int lastWordIdx, +			  int primaryDir, int spanDir, +			  double base, GBool dropCapLine, GString *s) { +  if (allTextInvisible && !drawInvisibleText) { +    return; +  } + +  if (spanDir != primaryDir) { +    int t = firstWordIdx; +    firstWordIdx = lastWordIdx; +    lastWordIdx = t; +  } + +  int wordIdx = firstWordIdx; +  while ((spanDir >= 0) ? wordIdx <= lastWordIdx +	                : wordIdx >= lastWordIdx) { +    TextWord *word0 = (TextWord *)words->get(wordIdx); + +    // form field(s): generate <input> element(s) +    if (convertFormFields && word0->getFontInfo() == formFieldFont) { +      for (int i = (spanDir >= 0) ? 0 : word0->getLength() - 1; +	   (spanDir >= 0) ? 
i < word0->getLength() : i >= 0; +	   i += spanDir) { +	int fieldIdx = word0->getChar(0) - 0x80000000; +	if (fieldIdx >= 0 && fieldIdx < formFieldInfo->getLength()) { +	  HTMLGenFormFieldInfo *ffi = +	      (HTMLGenFormFieldInfo *)formFieldInfo->get(fieldIdx); +	  AcroFormField *field = ffi->acroFormField; +	  AcroFormFieldType fieldType = field->getAcroFormFieldType(); +	  double llx, lly, urx, ury; +	  field->getBBox(&llx, &lly, &urx, &ury); +	  int width = (int)(urx - llx); +	  Ref fontID; +	  double fontSize; +	  field->getFont(&fontID, &fontSize); +	  if (fontSize == 0) { +	    fontSize = 12; +	  } +	  if (fieldType == acroFormFieldText) { +	    s->appendf("<input type=\"text\" class=\"textfield\" id=\"textfield{0:d}\" style=\"width:{1:d}px; font-size:{2:d}px;\">", nextFieldID, width, (int)(fontSize + 0.5)); +	    ++nextFieldID; +	  } else if (fieldType == acroFormFieldCheckbox) { +	    s->appendf("<input type=\"checkbox\" class=\"checkbox\" id=\"checkbox{0:d}\" style=\"width:{1:d}px; font-size:{2:d}px;\">", nextFieldID, width, (int)(fontSize + 0.5)); +	    ++nextFieldID; +	  } +	} +      } + +      if (word0->getSpaceAfter()) { +	s->append(' '); +      } + +      wordIdx += spanDir; + +    // skip invisible words +    } else if (!drawInvisibleText && +	       (word0->isInvisible() || word0->isRotated())) { +      wordIdx += spanDir; + +    // generate a <span> containing one or more words +    } else { + +      double r0 = 0, g0 = 0, b0 = 0; // make gcc happy +      VerticalAlignment vertAlign0 = vertAlignBaseline; // make gcc happy +      GString *linkURI0 = NULL; + +      GBool invisible = word0->isInvisible() || word0->isRotated(); + +      do { +	TextWord *word1 = (TextWord *)words->get(wordIdx); + +	// get word parameters +	double r1, g1, b1; +	word0->getColor(&r1, &g1, &b1); +	double base1 = word1->getBaseline(); +	VerticalAlignment vertAlign1; +	if (dropCapLine) { +	  //~ this will fail if there are subscripts or superscripts in +	  //~   the first 
line of a paragraph with a drop cap +	  vertAlign1 = vertAlignTop; +	} else if (base1 - base < -1) { +	  vertAlign1 = vertAlignSuper; +	} else if (base1 - base > 1) { +	  vertAlign1 = vertAlignSub; +	} else { +	  vertAlign1 = vertAlignBaseline; +	} +	GString *linkURI1 = word1->getLinkURI(); + +	// start of span +	if (word1 == word0) { +	  r0 = r1; +	  g0 = g1; +	  b0 = b1; +	  vertAlign0 = vertAlign1; +	  linkURI0 = linkURI1; + +	  int i; +	  for (i = 0; i < fonts->getLength(); ++i) { +	    if (word1->getFontInfo() == (TextFontInfo *)fonts->get(i)) { +	      break; +	    } +	  } +	  if (linkURI1) { +	    s->appendf("<a href=\"{0:t}\">", linkURI0); +	  } +	  // we force spans to be LTR or RTL; this is a kludge, but it's +	  // far easier than implementing the full Unicode bidi algorithm +	  const char *dirTag; +	  if (spanDir == primaryDir) { +	    dirTag = ""; +	  } else if (spanDir < 0) { +	    dirTag = " dir=\"rtl\""; +	  } else { +	    dirTag = " dir=\"ltr\""; +	  } +	  s->appendf("<span class=\"f{0:d}\"{1:s} style=\"font-size:{2:d}px;vertical-align:{3:s};{4:s}color:rgba({5:d},{6:d},{7:d},{8:d});\">", +		     i, +		     dirTag, +		     (int)(fontScales[i] * word1->getFontSize() * zoom), +		     vertAlignNames[vertAlign1], +		     (dropCapLine && wordIdx == 0) ? "line-height:75%;" : "", +		     (int)(r0 * 255), (int)(g0 * 255), (int)(b0 * 255), +		     invisible ? 
0 : 1); + +	// end of span +	} else if (word1->getFontInfo() != word0->getFontInfo() || +		   word1->getFontSize() != word0->getFontSize() || +		   word1->isInvisible() != word0->isInvisible() || +		   word1->isRotated() != word0->isRotated() || +		   vertAlign1 != vertAlign0 || +		   r1 != r0 || g1 != g0 || b1 != b0 || +		   linkURI1 != linkURI0) { +	  break; +	} + +	// add a space before the word, if needed +	// -- this only happens with the first word in a reverse section +	if (spanDir != primaryDir && wordIdx == firstWordIdx) { +	  GBool sp; +	  if (spanDir >= 0) { +	    if (wordIdx > 0) { +	      sp = ((TextWord *)words->get(wordIdx - 1))->getSpaceAfter(); +	    } else { +	      sp = gFalse; +	    } +	  } else { +	    sp = word1->getSpaceAfter(); +	  } +	  if (sp) { +	    s->append(' '); +	  } +	} + +	// generate the word text +	for (int i = (spanDir >= 0) ? 0 : word1->getLength() - 1; +	     (spanDir >= 0) ? i < word1->getLength() : i >= 0; +	     i += spanDir) { +	  Unicode u = word1->getChar(i); +	  if (u >= privateUnicodeMapStart && +	      u <= privateUnicodeMapEnd && +	      privateUnicodeMap[u - privateUnicodeMapStart]) { +	    u = privateUnicodeMap[u - privateUnicodeMapStart]; +	  } +	  appendUTF8(u, s); +	} + +	// add a space after the word, if needed +	// -- there is never a space after the last word in a reverse +	//    section (this will be handled as a space after the last +	//    word in the previous primary-direction section) +	GBool sp; +	if (spanDir != primaryDir && wordIdx == lastWordIdx) { +	  sp = gFalse; +	} else if (spanDir >= 0) { +	  sp = word1->getSpaceAfter(); +	} else { +	  if (wordIdx > 0) { +	    sp = ((TextWord *)words->get(wordIdx - 1))->getSpaceAfter(); +	  } else { +	    sp = gFalse; +	  } +	} +	if (sp) { +	  s->append(' '); +	} + +	wordIdx += spanDir; +      } while ((spanDir >= 0) ? 
wordIdx <= lastWordIdx +	                      : wordIdx >= lastWordIdx); + +      s->append("</span>"); +      if (linkURI0) { +	s->append("</a>"); +      } +    } +  } +} + +void HTMLGen::appendUTF8(Unicode u, GString *s) { +  if (u <= 0x7f) { +    if (u == '&') { +      s->append("&"); +    } else if (u == '<') { +      s->append("<"); +    } else if (u == '>') { +      s->append(">"); +    } else { +      s->append((char)u); +    } +  } else if (u <= 0x7ff) { +    s->append((char)(0xc0 + (u >> 6))); +    s->append((char)(0x80 + (u & 0x3f))); +  } else if (u <= 0xffff) { +    s->append((char)(0xe0 + (u >> 12))); +    s->append((char)(0x80 + ((u >> 6) & 0x3f))); +    s->append((char)(0x80 + (u & 0x3f))); +  } else if (u <= 0x1fffff) { +    s->append((char)(0xf0 + (u >> 18))); +    s->append((char)(0x80 + ((u >> 12) & 0x3f))); +    s->append((char)(0x80 + ((u >> 6) & 0x3f))); +    s->append((char)(0x80 + (u & 0x3f))); +  } else if (u <= 0x3ffffff) { +    s->append((char)(0xf8 + (u >> 24))); +    s->append((char)(0x80 + ((u >> 18) & 0x3f))); +    s->append((char)(0x80 + ((u >> 12) & 0x3f))); +    s->append((char)(0x80 + ((u >> 6) & 0x3f))); +    s->append((char)(0x80 + (u & 0x3f))); +  } else if (u <= 0x7fffffff) { +    s->append((char)(0xfc + (u >> 30))); +    s->append((char)(0x80 + ((u >> 24) & 0x3f))); +    s->append((char)(0x80 + ((u >> 18) & 0x3f))); +    s->append((char)(0x80 + ((u >> 12) & 0x3f))); +    s->append((char)(0x80 + ((u >> 6) & 0x3f))); +    s->append((char)(0x80 + (u & 0x3f))); +  } +} + +HTMLGenFontDefn *HTMLGen::getFontDefn(TextFontInfo *font, +				      const char *htmlDir) { +  Ref id; +  HTMLGenFontDefn *fontDefn; +  int i; + +  // check the existing font defns +  id = font->getFontID(); +  if (id.num >= 0) { +    for (i = 0; i < fontDefns->getLength(); ++i) { +      fontDefn = (HTMLGenFontDefn *)fontDefns->get(i); +      if (fontDefn->match(id)) { +	return fontDefn; +      } +    } +  } + +  // try to extract a font file +  if 
(!extractFontFiles || +      !(fontDefn = getFontFile(font, htmlDir))) { + +    // get a substitute font +    fontDefn = getSubstituteFont(font); +  } + +  fontDefns->append(fontDefn); +  return fontDefn; +} + +HTMLGenFontDefn *HTMLGen::getFontFile(TextFontInfo *font, +				      const char *htmlDir) { +  Ref id; +  HTMLGenFontDefn *fontDefn; +  Object fontObj; +  GfxFont *gfxFont; +  WebFont *webFont; +  GString *fontFile, *fontPath, *fontFace, *fontSpec; +  const char *family, *weight, *style; +  double scale; + +  id = font->getFontID(); +  if (id.num < 0) { +    return NULL; +  } + +  doc->getXRef()->fetch(id.num, id.gen, &fontObj); +  if (!fontObj.isDict()) { +    fontObj.free(); +    return NULL; +  } + +  gfxFont = GfxFont::makeFont(doc->getXRef(), "F", id, fontObj.getDict()); +  webFont = new WebFont(gfxFont, doc->getXRef()); +  fontDefn = NULL; +  fontFace = NULL; + +  if (webFont->canWriteTTF()) { +    if (embedFonts) { +      GString *ttfData = webFont->getTTFData(); +      if (ttfData) { +	fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"data:font/ttf;base64,", +				   nextFontFaceIdx); +	Base64Encoder enc(writeToString, fontFace); +	enc.encode((unsigned char *)ttfData->getCString(), +		   (size_t)ttfData->getLength()); +	enc.flush(); +	fontFace->append("\"); }\n"); +	delete ttfData; +      } +    } else { +      fontFile = GString::format("{0:d}.ttf", nextFontFaceIdx); +      fontPath = GString::format("{0:s}/{1:t}", htmlDir, fontFile); +      if (webFont->writeTTF(fontPath->getCString())) { +	fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"{1:t}\"); }}\n", +				   nextFontFaceIdx, fontFile); +      } +      delete fontPath; +      delete fontFile; +    } +    if (fontFace) { +      getFontDetails(font, &family, &weight, &style, &scale); +      fontSpec = GString::format("font-family:ff{0:d},{1:s}; font-weight:{2:s}; font-style:{3:s};", +				 nextFontFaceIdx, family, weight, style); +      
++nextFontFaceIdx; +      fontDefn = new HTMLGenFontDefn(id, fontFace, fontSpec, 1.0); +    } + +  } else if (webFont->canWriteOTF()) { +    if (embedFonts) { +      GString *otfData = webFont->getOTFData(); +      if (otfData) { +	fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"data:font/otf;base64,", +				   nextFontFaceIdx); +	Base64Encoder enc(writeToString, fontFace); +	enc.encode((unsigned char *)otfData->getCString(), +		   (size_t)otfData->getLength()); +	enc.flush(); +	fontFace->append("\"); }\n"); +	delete otfData; +      } +    } else { +      fontFile = GString::format("{0:d}.otf", nextFontFaceIdx); +      fontPath = GString::format("{0:s}/{1:t}", htmlDir, fontFile); +      if (webFont->writeOTF(fontPath->getCString())) { +	fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"{1:t}\"); }}\n", +				   nextFontFaceIdx, fontFile); +      } +      delete fontPath; +      delete fontFile; +    } +    if (fontFace) { +      getFontDetails(font, &family, &weight, &style, &scale); +      fontSpec = GString::format("font-family:ff{0:d},{1:s}; font-weight:{2:s}; font-style:{3:s};", +				 nextFontFaceIdx, family, weight, style); +      ++nextFontFaceIdx; +      fontDefn = new HTMLGenFontDefn(id, fontFace, fontSpec, 1.0); +    } +  } + +  delete webFont; +  delete gfxFont; +  fontObj.free(); + +  return fontDefn; +} + +HTMLGenFontDefn *HTMLGen::getSubstituteFont(TextFontInfo *font) { +  const char *family, *weight, *style; +  double scale; +  GString *fontSpec; + +  getFontDetails(font, &family, &weight, &style, &scale); +  fontSpec = GString::format("font-family:{0:s}; font-weight:{1:s}; font-style:{2:s};", +			     family, weight, style); +  return new HTMLGenFontDefn(font->getFontID(), NULL, fontSpec, scale); +} + +void HTMLGen::getFontDetails(TextFontInfo *font, const char **family, +			     const char **weight, const char **style, +			     double *scale) { +  GString *fontName; +  char *fontName2; +  
FontStyleTagInfo *fst; +  StandardFontInfo *sf; +  GBool fixedWidth, serif, bold, italic; +  double s; +  int n, i; + +  // get the font name, remove any subset tag +  fontName = font->getFontName(); +  if (fontName) { +    fontName2 = fontName->getCString(); +    n = fontName->getLength(); +    for (i = 0; i < n && i < 7; ++i) { +      if (fontName2[i] < 'A' || fontName2[i] > 'Z') { +	break; +      } +    } +    if (i == 6 && n > 7 && fontName2[6] == '+') { +      fontName2 += 7; +      n -= 7; +    } +  } else { +    fontName2 = NULL; +    n = 0; +  } + +  // get the style info from the font descriptor flags +  fixedWidth = font->isFixedWidth(); +  serif = font->isSerif(); +  bold = font->isBold(); +  italic = font->isItalic(); + +  if (fontName2) { + +    // look for a style tag at the end of the font name -- this +    // overrides the font descriptor bold/italic flags +    for (fst = fontStyleTags; fst->tag; ++fst) { +      if (n > fst->tagLen && +	  !strcasecmp(fontName2 + n - fst->tagLen, fst->tag)) { +	bold = fst->bold; +	italic = fst->italic; +	n -= fst->tagLen; +	if (n > 1 && (fontName2[n-1] == '-' || +		      fontName2[n-1] == ',' || +		      fontName2[n-1] == '.' || +		      fontName2[n-1] == '_')) { +	  --n; +	} +	break; +      } +    } + +    // look for a known font name -- this overrides the font descriptor +    // fixedWidth/serif flags +    for (sf = standardFonts; sf->name; ++sf) { +      if (!strncasecmp(fontName2, sf->name, n)) { +	fixedWidth = sf->fixedWidth; +	serif = sf->serif; +	break; +      } +    } +  } + +  // compute the scaling factor +  *scale = 1; +  if ((s = font->getMWidth())) { +    i = (fixedWidth ? 8 : serif ? 4 : 0) + (bold ? 2 : 0) + (italic ? 1 : 0); +    if (s < substFonts[i].mWidth) { +      *scale = s / substFonts[i].mWidth; +    } +  } + +  *family = fixedWidth ? "monospace" : serif ? "serif" : "sans-serif"; +  *weight = bold ? "bold" : "normal"; +  *style = italic ? "italic" : "normal"; +} | 
