javascript - Convert pdf to html files using node.js and pdf.js - Stack Overflow

I want to convert pdf to html pages using pdf.js. Pdf.js does that in a browser but is it possible to g

I want to convert pdf to html pages using pdf.js. Pdf.js does that in a browser but is it possible to get those html pages rendered by browser in backend thus converting a pdf of n pages to n number of html files. I am using node.js as backend. I have tried pdf2html and other similar npm modules, they don't work great and have issues with some pdfs. Thank you for suggestions.

I want to convert pdf to html pages using pdf.js. Pdf.js does that in a browser but is it possible to get those html pages rendered by browser in backend thus converting a pdf of n pages to n number of html files. I am using node.js as backend. I have tried pdf2html and other similar npm modules, they don't work great and have issues with some pdfs. Thank you for suggestions.

Share Improve this question asked Jun 10, 2019 at 12:52 last_fixlast_fix 3791 gold badge4 silver badges20 bronze badges 3
  • Your Solution is here-> bytescout.com/articles/… – Anand Choudhary Commented Jun 10, 2019 at 12:57
  • this is not free! :( – last_fix Commented Jun 10, 2019 at 13:04
  • pdf.js converts PDF to images (canvas, PNG, etc.). It won't convert PDF to HTML. – shaochuancs Commented Jun 10, 2019 at 15:05
Add a comment  | 

1 Answer 1

Reset to default 3

Maybe I found something similar - I am working with local PDF file and browser. I made small changes in ready made viewer.js / PDF.js, it should be possible to process using both Node.js & browser.

This script embeds the PDF specified by the command-line argument into the viewer.js Webpack bundle and starts the browser.

const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
const chp = require('child_process');
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));

/**
 * Returns a copy of the viewer.js source in which the bundled sample PDF name
 * is replaced by `dataUri` and `"file://"` is appended to the
 * HOSTED_VIEWER_ORIGINS array literal.
 *
 * @param {string} viewerSource - Text of the PDF.js viewer bundle.
 * @param {string} dataUri - Data URI of the PDF to embed.
 * @returns {string} The patched viewer source.
 * @throws {Error} If either anchor cannot be located in the expected order.
 */
function embedPdf(viewerSource, dataUri) {
    // Full default file name shipped with the PDF.js viewer.
    const pdfName = 'compressed.tracemonkey-pldi-09.pdf';
    const nameStart = viewerSource.indexOf(pdfName);
    if (nameStart < 0) {
        // Typically means the file was already patched by an earlier run.
        throw new Error(`"${pdfName}" not found in viewer source (already patched?)`);
    }
    const nameEnd = nameStart + pdfName.length;
    const originsDecl = viewerSource.indexOf('HOSTED_VIEWER_ORIGINS');
    const originsEnd = originsDecl < 0 ? -1 : viewerSource.indexOf(']', originsDecl);
    // The splice below requires the origins array to close after the PDF name.
    if (originsEnd < nameEnd) {
        throw new Error('HOSTED_VIEWER_ORIGINS array not found after the default PDF name');
    }
    return viewerSource.slice(0, nameStart) + dataUri +
        viewerSource.slice(nameEnd, originsEnd) + ', "file://"' +
        viewerSource.slice(originsEnd);
}

// Embed the PDF given on the command line into viewer.js (overwritten in
// place), then open viewer.html with the system's associated application.
datauri(pdf, (err, content) => {
    if (err) {
        throw err;
    }
    const viewerJSpath = path.join(__dirname, './viewer.js');
    const patched = embedPdf(fs.readFileSync(viewerJSpath, 'utf-8'), content);
    fs.writeFileSync(viewerJSpath, patched, 'utf-8');
    const c = path.join(__dirname, 'viewer.html');
    // Quote the path so directories containing spaces still work.
    chp.execSync(`"${c}"`);
});

Then I tried adding the original width as an additional style parameter in renderTextLayer's appendText method, and sorting the elements by position in TextLayerBuilder's render method, right after this.textLayerDiv.appendChild(textLayerFrag);.

All of the mentioned PDF.js changes are on my GitHub; it seems only the web and build folders are required (apart from npm i -g datauri, for example).

Readability improvements ? #12512 PR

Console script / bookmarklet


Using puppeteer and a slightly modified PDF.js, it is possible to convert directly (works both headed and headless, but element sizes differ slightly).

const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
const puppeteer = require(path.join(process.env.APPDATA, 'npm/node_modules', 'puppeteer'));

/**
 * Returns a copy of the viewer source in which the bundled sample PDF name is
 * replaced by `dataUri` and `"file://"` is appended to the
 * HOSTED_VIEWER_ORIGINS array literal.
 *
 * @param {string} viewerSource - Text of viewerSrc.js.
 * @param {string} dataUri - Data URI of the PDF to embed.
 * @returns {string} The patched viewer source.
 * @throws {Error} If either anchor cannot be located in the expected order.
 */
function embedPdf(viewerSource, dataUri) {
    // Full default file name shipped with the PDF.js viewer.
    const pdfName = 'compressed.tracemonkey-pldi-09.pdf';
    const nameStart = viewerSource.indexOf(pdfName);
    if (nameStart < 0) {
        throw new Error(`"${pdfName}" not found in viewer source`);
    }
    const nameEnd = nameStart + pdfName.length;
    const originsDecl = viewerSource.indexOf('HOSTED_VIEWER_ORIGINS');
    const originsEnd = originsDecl < 0 ? -1 : viewerSource.indexOf(']', originsDecl);
    // The splice below requires the origins array to close after the PDF name.
    if (originsEnd < nameEnd) {
        throw new Error('HOSTED_VIEWER_ORIGINS array not found after the default PDF name');
    }
    return viewerSource.slice(0, nameStart) + dataUri +
        viewerSource.slice(nameEnd, originsEnd) + ', "file://"' +
        viewerSource.slice(originsEnd);
}

// Build viewer.js from viewerSrc.js with the PDF embedded, load viewer.html in
// a puppeteer-controlled browser, and save whatever the page passes to
// window.reader() into PDFtexts.txt.
datauri(pdf, (err, content) => {
    if (err) {
        throw err;
    }
    const viewerJSpath = path.join(__dirname, './viewer');
    const patched = embedPdf(fs.readFileSync(viewerJSpath + 'Src.js', 'utf-8'), content);
    fs.writeFileSync(viewerJSpath + '.js', patched, 'utf-8');
    (async () => {
        const browser = await puppeteer.launch({
            // headless: false
        });
        try {
            const [page] = await browser.pages();
            // Install the callback BEFORE navigating so the page can never
            // call window.reader() while it is still undefined.
            await page.exposeFunction('reader', (elLists) => {
                fs.writeFileSync(path.join(__dirname, 'PDFtexts.txt'), JSON.stringify(elLists, null, 4));
                // Short delay before closing the browser, as in the original script.
                setTimeout(() => { browser.close().catch(console.error); }, 100);
            });
            const c = path.join(__dirname, 'viewer.html');
            await page.goto('file:///' + c);
        } catch (e) {
            // Do not leave a stray browser process behind on failure.
            await browser.close();
            throw e;
        }
    })().catch((e) => {
        console.error(e);
        process.exitCode = 1;
    });
});

Fixes required for puppeteer/chromium:

const message = exception?.message; // => exception.message
page: this.pageLabel ?? this.id // => this.pageLabel || this.id

viewer.js => viewerSrc.js basic additions:

function webViewerPageRendered({
...
  if (pageNumber < PDFViewerApplication.pagesCount) {
    arguments[0].source.eventBus.dispatch("pagenumberchanged", {
      value: pageNumber + 1
    }); // generate all remaining pages
  }
}

class BaseViewer {
  constructor(options) {
    this.pageNo = []; // rendered pages array
...
  _setCurrentPageNumber(val, resetCurrentPageView = false) {
...
    if (this.pageNo.indexOf(val) < 0) {
      this.pageNo.push(val);
    }
    if (this.pagesCount - 1 <= this.pageNo.length) {
      window.reader(elLists); // sent result back 2 node.js
    }

And result looks like {PageNo:{ElNo:{data}, ...}, ...} and could be simply translated to web page or further processed.

{
    "1": {
        "0": {
            "x": 99.9871,
            "y": 98.0496,
            "w": 557.695,
            "h": 22,
            "text": "Trace-based Just-in-Time Type Specialization for Dynamic",
            "ff": "sans-serif",
            "fs": "22.2695px",
            "cssText": "left: 99.9871px; top: 98.0496px; width: 557.695px; font-size: 22.2695px; font-family: sans-serif; transform: scaleX(0.970163);"
        },
        "1": {
            "x": 327.478,
            "y": 122.793,
            "w": 102.707,
            "h": 22,
            "text": "Languages",
            "ff": "sans-serif",
            "fs": "22.2695px",
            "cssText": "left: 327.478px; top: 122.793px; width: 102.707px; font-size: 22.2695px; font-family: sans-serif; transform: scaleX(0.932262);"
        },
...
    "2": {
        "0": {
            "x": 393.677,
            "y": 90.3408,
            "w": 192.909,
            "h": 11,
            "text": "1 for (var i = 2; i < 100; ++i) {",
            "ff": "monospace",
            "fs": "11.1347px",
            "cssText": "left: 393.677px; top: 90.3408px; width: 192.909px; font-size: 11.1347px; font-family: monospace; transform: scaleX(0.875232);"
        },
        "1": {
            "x": 67.0588,
            "y": 91.7599,
            "w": 173.346,
            "h": 11,
            "text": "Hence, recording and compiling a trace",
            "ff": "sans-serif",
            "fs": "11.1347px",
            "cssText": "left: 67.0588px; top: 91.7599px; width: 173.346px; font-size: 11.1347px; font-family: sans-serif; transform: scaleX(0.895175);"
        },






Summary of changes (in original gh-pages branch):

- changes in PDF.js

  function appendText(task, geom, styles) {
...
    let left, top;
+++              , width;
...
    if (angle === 0) {
      left = tx[4];
      top = tx[5] - fontAscent;
    } else {
      left = tx[4] + fontAscent * Math.sin(angle);
      top = tx[5] - fontAscent * Math.cos(angle);
    }
+++ width = geom.width * task._viewport.transform[0];

    textDiv.style.left = `${left}px`;
    textDiv.style.top = `${top}px`;
+++ textDiv.style.width = `${width}px`;

- new nodeView.js

const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
const chp = require('child_process');
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
const viewerJSpath = path.join(__dirname, './viewer');

/**
 * Returns a copy of the viewer source in which the bundled sample PDF name is
 * replaced by `dataUri` and `"file://"` is appended to the
 * HOSTED_VIEWER_ORIGINS array literal.
 *
 * @param {string} viewerSource - Text of viewerSrc.js.
 * @param {string} dataUri - Data URI of the PDF to embed.
 * @returns {string} The patched viewer source.
 * @throws {Error} If either anchor cannot be located in the expected order.
 */
function embedPdf(viewerSource, dataUri) {
    // Full default file name shipped with the PDF.js viewer.
    const pdfName = 'compressed.tracemonkey-pldi-09.pdf';
    const nameStart = viewerSource.indexOf(pdfName);
    if (nameStart < 0) {
        throw new Error(`"${pdfName}" not found in viewer source`);
    }
    const nameEnd = nameStart + pdfName.length;
    const originsDecl = viewerSource.indexOf('HOSTED_VIEWER_ORIGINS');
    const originsEnd = originsDecl < 0 ? -1 : viewerSource.indexOf(']', originsDecl);
    // The splice below requires the origins array to close after the PDF name.
    if (originsEnd < nameEnd) {
        throw new Error('HOSTED_VIEWER_ORIGINS array not found after the default PDF name');
    }
    return viewerSource.slice(0, nameStart) + dataUri +
        viewerSource.slice(nameEnd, originsEnd) + ', "file://"' +
        viewerSource.slice(originsEnd);
}

// Build viewer.js from viewerSrc.js with the PDF embedded, then open the
// viewer.ff file with whatever application the system associates with it.
(async () => {
    // `await` accepts both a plain return value and a Promise, so this works
    // whether or not the installed datauri resolves asynchronously.
    const content = await datauri(pdf);
    if (typeof content !== 'string') {
        throw new TypeError('datauri did not produce a data URI string');
    }
    // Read the pristine viewerSrc.js and write viewer.js, matching the
    // puppeteer-based script; the original read an extension-less "viewer".
    // NOTE(review): confirm viewerSrc.js is the intended input here.
    const patched = embedPdf(fs.readFileSync(viewerJSpath + 'Src.js', 'utf-8'), content);
    fs.writeFileSync(viewerJSpath + '.js', patched, 'utf-8');
    const c = path.join(__dirname, 'viewer.ff');
    // Quote the path so directories containing spaces still work.
    chp.execSync(`"${c}"`);
})().catch((e) => {
    console.error(e);
    process.exitCode = 1;
});

- new openFF.bat: start node nodeView.js %1

- new pdf2sortedMergedTexts.js

const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
const puppeteer = require(path.join(process.env.APPDATA, 'npm/node_modules', 'puppeteer'));

/**
 * Returns a copy of the viewer source in which the bundled sample PDF name is
 * replaced by `dataUri` and `"file://"` is appended to the
 * HOSTED_VIEWER_ORIGINS array literal.
 *
 * @param {string} viewerSource - Text of viewerSrc.js.
 * @param {string} dataUri - Data URI of the PDF to embed.
 * @returns {string} The patched viewer source.
 * @throws {Error} If either anchor cannot be located in the expected order.
 */
function embedPdf(viewerSource, dataUri) {
    // Full default file name shipped with the PDF.js viewer.
    const pdfName = 'compressed.tracemonkey-pldi-09.pdf';
    const nameStart = viewerSource.indexOf(pdfName);
    if (nameStart < 0) {
        throw new Error(`"${pdfName}" not found in viewer source`);
    }
    const nameEnd = nameStart + pdfName.length;
    const originsDecl = viewerSource.indexOf('HOSTED_VIEWER_ORIGINS');
    const originsEnd = originsDecl < 0 ? -1 : viewerSource.indexOf(']', originsDecl);
    // The splice below requires the origins array to close after the PDF name.
    if (originsEnd < nameEnd) {
        throw new Error('HOSTED_VIEWER_ORIGINS array not found after the default PDF name');
    }
    return viewerSource.slice(0, nameStart) + dataUri +
        viewerSource.slice(nameEnd, originsEnd) + ', "file://"' +
        viewerSource.slice(originsEnd);
}

// Build viewer.js from viewerSrc.js with the PDF embedded, load viewer.html in
// a puppeteer-controlled browser, and save whatever the page passes to
// window.reader() into PDFtexts.txt.
datauri(pdf, (err, content) => {
    if (err) {
        throw err;
    }
    const viewerJSpath = path.join(__dirname, './viewer');
    const patched = embedPdf(fs.readFileSync(viewerJSpath + 'Src.js', 'utf-8'), content);
    fs.writeFileSync(viewerJSpath + '.js', patched, 'utf-8');
    (async () => {
        const browser = await puppeteer.launch({
            // headless: false
        });
        try {
            const [page] = await browser.pages();
            // Install the callback BEFORE navigating so the page can never
            // call window.reader() while it is still undefined.
            await page.exposeFunction('reader', (elLists) => {
                fs.writeFileSync(path.join(__dirname, 'PDFtexts.txt'), JSON.stringify(elLists, null, 4));
                // Short delay before closing the browser, as in the original script.
                setTimeout(() => { browser.close().catch(console.error); }, 100);
            });
            const c = path.join(__dirname, 'viewer.html');
            await page.goto('file:///' + c);
        } catch (e) {
            // Do not leave a stray browser process behind on failure.
            await browser.close();
            throw e;
        }
    })().catch((e) => {
        console.error(e);
        process.exitCode = 1;
    });

});

- changed viewer.js -> viewerSrc.js

function webViewerPageRendered({
... +++
  if (pageNumber < PDFViewerApplication.pagesCount) {
    arguments[0].source.eventBus.dispatch("pagenumberchanged", {
      value: pageNumber + 1
    });
  }
}
...
class BaseViewer {
  constructor(options) {
+++ this.pageNo = [];

...
  _setCurrentPageNumber(val, resetCurrentPageView = false) {
...
+++ if (this.pageNo.indexOf(val) < 0) {
+++   this.pageNo.push(val);
+++   console.log(this.pageNo);
+++ }
+++ if (this.pagesCount - 1 <= this.pageNo.length) {
+++   window.reader(elLists);
+++ }

    this._currentPageNumber = val;

  render(timeout = 0) {
...
    this.textLayerRenderTask.promise.then(() => {
      this.textLayerDiv.appendChild(textLayerFrag);
+++   this.reorder(this.textLayerDiv);

... new
  reorder(_src) {
    const src = _src.children;
    let els = [];
    const elDest = [];
    for (let j = 0; j < src.length; j++) {
        const i = src[j];
        if (i.className === 'endOfContent') continue;
        els.push({ x: parseFloat(i.style.left), y: parseFloat(i.style.top), w: parseFloat(i.style.width), h: i.offsetHeight, text: i.innerText, ff: i.style.fontFamily, fs: i.style.fontSize, cssText: i.style.cssText });
    }
    els.sort((a, b) => {
      if (Math.abs(a.y - b.y) <= 1) {
          if (Math.abs(a.x - b.x) <= 1) return 0;
          else return a.x - b.x;
      } else return a.y - b.y;
    });
    let elMin = els[0];
    for (let i = 1; i < els.length; i++) {
         if (elMin.x + elMin.w + 1 >= els[i].x &&
          Math.abs(elMin.y - els[i].y) < 1 &&
          elMin.h === els[i].h &&
          elMin.ff === els[i].ff &&
          elMin.fs === els[i].fs) {
            elMin.text += els[i].text;
            elMin.w = els[i].x + els[i].w - elMin.x;
            if (elDest[elDest.length - 1] !== elMin) elDest.push(elMin);
            continue;
        }
        if (elDest[elDest.length - 1] !== elMin) elDest.push(elMin);
        elMin = els[i];
    }
    if (elDest[elDest.length - 1] !== elMin) elDest.push(elMin);
    els = _src;
    while (els.lastChild) els.removeChild(els.lastChild);
    const elList = [];
    if (window.elLists === undefined) window.elLists = {};
    const uqIdx = { x: [], y: [] };
    for (let i = 0; i < elDest.length; i++) {
        const o = document.createElement('DIV');
        o.innerHTML = elDest[i].text;
        o.setAttribute('style', elDest[i].cssText + 'width:' + elDest[i].w + 'px;position:absolute;');
        els.appendChild(o);
        elList.push([elDest[i].x, elDest[i].x + elDest[i].w, o, elDest[i].y, elDest[i].y + elDest[i].h, elDest[i].text]);
        if (uqIdx.x.indexOf(elDest[i].x) < 0) uqIdx.x.push(elDest[i].x);
        if (uqIdx.y.indexOf(elDest[i].y) < 0) uqIdx.y.push(elDest[i].y);
    }
    elLists[_src.parentElement.getAttribute("data-page-Number")] = Object.assign({}, elDest);
  }

- changed viewer.css

+++ input{padding:0px;border:1px solid #e0e0e0;}input:focus{background-color:red;}

发布者:admin,转转请注明出处:http://www.yc00.com/questions/1744185562a4562190.html

相关推荐

发表回复

评论列表(0条)

  • 暂无评论

联系我们

400-800-8888

在线咨询: QQ交谈

邮件:admin@example.com

工作时间:周一至周五,9:30-18:30,节假日休息

关注微信