Welcome to OGeek Q&A Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
931 views
in Technique[技术] by (71.8m points)

google apps script - Merge Multiple PDF's into one PDF

I am having some issues with my code. I am trying to loop through a Drive folder that contains many PDFs and then merge these into one file. When I use my code it just creates a PDF for the last PDF in the Drive folder and not merge them all together as expected.

function MergeFiles(){
  var folder = DocsList.getFolderById('myFolderID'); 
  var files = folder.getFiles(); 
  var blobs = [];    
  for( var i in files )   
    blobs.push(files[i].getBlob().getBytes());
  Logger.log(blobs.push(files[i].getBlob().getBytes()));
  var myPDF = Utilities.newBlob(blobs.pop(), "application/pdf", "newPDF.pdf");
  folder.createFile(myPDF);
}
See Question&Answers more detail:os

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Reply

0 votes
by (71.8m points)

So there is more to this than simply combining the data from each file. The actual usable data for each file is "packaged" with markups and other code (similar to HTML and other document formats). You actually have to decode each PDF file, combine the necessary parts, then re-encode with new "packaging." This requires a working knowledge of the PDF specifications and structure, available freely from Adobe here.

I used this information to write a script sufficient for my needs. It does not account for every possibility, however, so particularly merging any documents requiring PDF-1.4 and higher, this will need quite a bit of work.

/**
 * Merges all given PDF files into one.
 *
 * @param {Folder} directory the folder to store the output file
 * @param {string} name the desired name of the output file
 * @param {File} pdf1 the first PDF file
 * @param {File} pdf2 the second PDF file
 * @param {File} opt_pdf3 [optional] the third PDF file; add as many more as you like
 *
 * @return {File} the merged file
 */
function mergePdfs(directory, name, pdf1, pdf2, opt_pdf3) {

  if (name.slice(-4) != '.pdf') {

    name = name + '.pdf';

  }
  var newObjects = ['1 0 obj
<</Type/Catalog/Pages 2 0 R >>
endobj'];
  var pageAddresses = [];
  for (var argumentIndex = 2; argumentIndex < arguments.length; argumentIndex++) {

    var bytes = arguments[argumentIndex].getBlob().getBytes();
    var xrefByteOffset = '';
    var byteIndex = bytes.length - 1;
    while (!/sstartxrefs/.test(xrefByteOffset)) {

      xrefByteOffset = String.fromCharCode(bytes[byteIndex]) + xrefByteOffset;
      byteIndex--;

    }
    xrefByteOffset = +(/sd+s/.exec(xrefByteOffset)[0]);
    var objectByteOffsets = [];
    var trailerDictionary = '';
    var rootAddress = '';
    do {

      var xrefTable = '';
      var trailerEndByteOffset = byteIndex;
      byteIndex = xrefByteOffset;
      for (byteIndex; byteIndex <= trailerEndByteOffset; byteIndex++) {

        xrefTable = xrefTable + String.fromCharCode(bytes[byteIndex]);

      }
      xrefTable = xrefTable.split(/s*trailers*/);
      trailerDictionary = xrefTable[1];
      if (objectByteOffsets.length < 1) {

        rootAddress = /d+s+d+s+R/.exec(//Roots*d+s+d+s+R/.exec(trailerDictionary)[0])[0].replace('R', 'obj');

      }
      xrefTable = xrefTable[0].split('
');
      xrefTable.shift();
      while (xrefTable.length > 0) {

        var xrefSectionHeader = xrefTable.shift().split(/s+/);
        var objectNumber = +xrefSectionHeader[0];
        var numberObjects = +xrefSectionHeader[1];
        for (var entryIndex = 0; entryIndex < numberObjects; entryIndex++) {

          var entry = xrefTable.shift().split(/s+/);
          objectByteOffsets.push([[objectNumber, +entry[1], 'obj'], +entry[0]]);
          objectNumber++;

        }

      }
      if (/s*/Prev/.test(trailerDictionary)) {

        xrefByteOffset = +(/s*d+s/.exec(/s*/Prevs*d+s/.exec(trailerDictionary)[0])[0]);

      }

    } while (/s*/Prev/.test(trailerDictionary));
    var rootObject = getObject(rootAddress, objectByteOffsets, bytes);
    var pagesAddress = /d+s+d+s+R/.exec(//Pagess*d+s+d+s+R/.exec(rootObject)[0])[0].replace('R', 'obj');
    var pagesObject = getObject(pagesAddress, objectByteOffsets, bytes);
    var objects = getDependencies(pagesObject, objectByteOffsets, bytes);
    var newObjectsInsertionIndex = newObjects.length;
    for (var objectIndex = 0; objectIndex < objects.length; objectIndex++) {

      var newObjectAddress = [(newObjects.length + 3) + '', 0 + '', 'obj'];
      if (!Array.isArray(objects[objectIndex])) {

        objects[objectIndex] = [objects[objectIndex]];

      }
      objects[objectIndex].unshift(newObjectAddress);
      var objectAddress = objects[objectIndex][1].match(/d+s+d+s+obj/)[0].split(/s+/);
      objects[objectIndex].splice(1, 0, objectAddress);
      if (//Types*/Page[^s]/.test(objects[objectIndex][2])) {

        objects[objectIndex][2] = objects[objectIndex][2].replace(//Parents*d+s+d+s+R/.exec(objects[objectIndex][2])[0], '/Parent 2 0 R');
        pageAddresses.push(newObjectAddress.join(' ').replace('obj', 'R'));

      }
      var addressRegExp = new RegExp(objectAddress[0] + '\s+' + objectAddress[1] + '\s+' + 'obj');
      objects[objectIndex][2] = objects[objectIndex][2].replace(addressRegExp.exec(objects[objectIndex][2])[0], newObjectAddress.join(' '));
      newObjects.push(objects[objectIndex]);

    }
    for (var referencingObjectIndex = newObjectsInsertionIndex; referencingObjectIndex < newObjects.length; referencingObjectIndex++) {

      var references = newObjects[referencingObjectIndex][2].match(/d+s+d+s+R/g);
      if (references != null) {

        var string = newObjects[referencingObjectIndex][2];
        var referenceIndices = [];
        var currentIndex = 0;
        for (var referenceIndex = 0; referenceIndex < references.length; referenceIndex++) {

          referenceIndices.push([]);
          referenceIndices[referenceIndex].push(string.slice(currentIndex).indexOf(references[referenceIndex]) + currentIndex);
          referenceIndices[referenceIndex].push(references[referenceIndex].length);
          currentIndex += string.slice(currentIndex).indexOf(references[referenceIndex]);

        }
        for (var referenceIndex = 0; referenceIndex < references.length; referenceIndex++) {

          var objectAddress = references[referenceIndex].replace('R', 'obj').split(/s+/);
          for (var objectIndex = newObjectsInsertionIndex; objectIndex < newObjects.length; objectIndex++) {

            if (arrayEquals(objectAddress, newObjects[objectIndex][1])) {

              var length = string.length;
              newObjects[referencingObjectIndex][2] = string.slice(0, referenceIndices[referenceIndex][0]) + newObjects[objectIndex][0].join(' ').replace('obj', 'R') +
                string.slice(referenceIndices[referenceIndex][0] + referenceIndices[referenceIndex][1]);
              string = newObjects[referencingObjectIndex][2];
              var newLength = string.length;
              if (!(length == newLength)) {

                for (var subsequentReferenceIndex = referenceIndex + 1; subsequentReferenceIndex < references.length; subsequentReferenceIndex++) {

                  referenceIndices[subsequentReferenceIndex][0] += (newLength - length);

                }

              }
              break;

            }

          }

        }

      }

    }
    for (var objectIndex = newObjectsInsertionIndex; objectIndex < newObjects.length; objectIndex++) {

      if (Array.isArray(newObjects[objectIndex])) {

        if (newObjects[objectIndex][3] != undefined) {

          newObjects[objectIndex] = newObjects[objectIndex].slice(2);

        } else {

          newObjects[objectIndex] = newObjects[objectIndex][2];

        }

      }

    }

  }
  newObjects.splice(1, 0, '2 0 obj
<</Type/Pages/Count ' + pageAddresses.length + ' /Kids [' + pageAddresses.join(' ') + ' ]>>
endobj');
  newObjects.splice(2, 0, '3 0 obj
<</Title (' + name + ') /CreationDate (D' +
       Utilities.formatDate(new Date(), CalendarApp.getDefaultCalendar().getTimeZone(), 'yyyyMMddHHmmssZ').slice(0, -2) + "'00) /ModDate (D" + Utilities.formatDate(new Date(),
       CalendarApp.getDefaultCalendar().getTimeZone(), 'yyyyMMddHHmmssZ').slice(0, -2) + "'00)>>
endobj");
  var byteOffsets = [0];
  var bytes = [];
  var header = '%PDF-1.3
';
  for (var headerIndex = 0; headerIndex < header.length; headerIndex++) {

    bytes.push(header.charCodeAt(headerIndex));

  }
  bytes.push('%'.charCodeAt(0));
  for (var characterCode = -127; characterCode < -123; characterCode++) {

    bytes.push(characterCode);

  }
  bytes.push('
'.charCodeAt(0));
  bytes.push('
'.charCodeAt(0));
  while (newObjects.length > 0) {

    byteOffsets.push(bytes.length);
    var object = newObjects.shift();
    if (Array.isArray(object)) {

      var streamKeyword = /streams*
/.exec(object[0])[0];
      if (streamKeyword.indexOf('

') > streamKeyword.length - 3) {

        streamKeyword = streamKeyword.slice(0, -1);

      } else if (streamKeyword.indexOf('

') > streamKeyword.length - 5) {

        streamKeyword = streamKeyword.slice(0, -2);

      }
      var streamIndex = object[0].indexOf(streamKeyword) + streamKeyword.length;
      for (var objectIndex = 0; objectIndex < streamIndex; objectIndex++) {

        bytes.push(object[0].charCodeAt(objectIndex))

      }
      bytes = bytes.concat(object[1]);
      for (var objectIndex = streamIndex; objectIndex < object[0].length; objectIndex++) {

        bytes.push(object[0].charCodeAt(objectIndex));

      }

    } else {

      for (var objectIndex = 0; objectIndex < object.length; objectIndex++) {

        bytes.push(object.charCodeAt(objectIndex));

      }

    }
    bytes.push('
'.charCodeAt(0));
    bytes.push('
'.charCodeAt(0));

  }
  var xrefByteOffset = bytes.length;
  var xrefHeader = 'xref
';
  for (var xrefHeaderIndex = 0; xrefHeaderIndex < xrefHeader.length; xrefHeaderIndex++) {

    bytes.push(xrefHeader.charCodeAt(xrefHeaderIndex));

  }
  var xrefSectionHeader = '0 ' + byteOffsets.length + '
';
  for (var xrefSectionHeaderIndex = 0; xrefSectionHeaderIndex < xrefSectionHeader.length; xrefSectionHeaderIndex++) {

    bytes.push(xrefSectionHeader.charCodeAt(xrefSectionHeaderIndex));

  }
  for (var byteOffsetIndex = 0; byteOffsetIndex < byteOffsets.length; byteOffsetIndex++) {

    for (var byteOffsetStringIndex = 0; byteOffsetStringIndex < 10; byteOffsetStringIndex++) {

      bytes.push(Utilities.formatString('%010d', byteOffsets[byteOffsetIndex]).charCodeAt(byteOffsetStringIndex));

    }
    bytes.push(' '.charCodeAt(0));
    if (byteOffsetIndex == 0) {

      for (var generationStringIndex = 0; generationStringIndex < 5; generationStringIndex++) {

        bytes.push('65535'.charCodeAt(generationStringIndex));

      }
      for (var keywordIndex = 0; keywordIndex < 2; keywordIndex++) {

        bytes.push(' f'.charCodeAt(keywordIndex));

      }

    } else {

      for (var generationStringIndex = 0; generationStringIndex < 5; generationStringIndex++) {

        bytes.push('0'.charCodeAt(0));

      }
      for (var keywordIndex = 0; keywordIndex < 2; keywordIndex++) {

        bytes.push(' n'.charCodeAt(keywordIndex));

      }

    }
    bytes.push('
'.charCodeAt(0));
    bytes.push('
'.charCodeAt(0));

  }
  for (var trailerHeaderIndex = 0; trailerHeaderIndex < 9; trailerHeaderIndex++) {

    bytes.push('trailer
'.charCodeAt(trailerHeaderIndex));

  }
  var idBytes = Utilities.computeDigest(Utilities.DigestAlgorithm.MD5, (new Date).toString());
  var id = '';
  for (var idByteIndex = 0; idByteIndex < idBytes.length; idByteIndex++) {

    id = id + ('0' + (idBytes[idByteIndex] & 0xFF).toString(16)).slice(-2);

  }
  var trailer = '<</Size ' + (byteOffsets.length) + ' /Root 1 0 R /Info 2 0 R /ID [<' + id + '> <' + id + '>]>>
startxref
' + xrefByteOffset + '
%%EOF';
  for (var trailerIndex = 0; trailerIndex < trailer.length; trailerIndex++) {

    bytes.push(trailer.charCodeAt(trailerIndex));

  }
  return directory.createFile(Utilities.newBlob(bytes, 'application/pdf', name));
  function getObject(objectAddress, objectByteOffsets, bytes) {

    objectAddress = objectAddress.split(/s+/);
    for (var addressIndex = 0; addressIndex < 2; addressIndex++) {

      objectAddress[addressIndex] = +o

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
OGeek|极客中国-欢迎来到极客的世界,一个免费开放的程序员编程交流平台!开放,进步,分享!让技术改变生活,让极客改变未来! Welcome to OGeek Q&A Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

...