From: Antonio Ospite Date: Mon, 6 Jun 2016 14:48:24 +0000 (+0200) Subject: Initial import X-Git-Url: https://git.ao2.it/pdfstrip.git/commitdiff_plain/965494b69b9bb4182f8a2e50e2eaada0eac4b12f?ds=sidebyside Initial import --- 965494b69b9bb4182f8a2e50e2eaada0eac4b12f diff --git a/README b/README new file mode 100644 index 0000000..c114880 --- /dev/null +++ b/README @@ -0,0 +1,49 @@ +pdfstrip.py - strip objects from PDF files by specifying objects IDs + +Example of use: + + a. Uncompress the PDF file with PDFtk, e.g.: + + $ pdftk input.pdf output uncompressed.pdf uncompress + + b. Find the objects IDs you want to strip; for example for images, the + program pdfimages from poppler-utils can be used like this: + + 1. List all the images: + + $ pdfimages -list uncompressed.pdf > pdfimages.txt + + 2. Extract all the images: + + $ mkdir images + $ pdfimages -p -all uncompressed.pdf images/image + + 3. Isolate unique images, for an easier analysis: + + $ mkdir uniq-images + $ md5sum images/image-* | sort | uniq -w 32 | tr -s ' ' | cut -d ' ' -f 2 | while read file; do cp "$file" uniq-images/; done + + 4. Compare the file names with the content of pdfimages.txt from 1. and + find the objects IDs, the result is a list of objects IDs, like this: + + 53,52,51,50,49,48,66,65,64,63,62,68,103,102,101,111,110,109,108,107,106 + + c. Pass the list of objects to pdfstrip.py (here the list is shown sorted, + just for readability): + + $ ./pdfstrip.py uncompressed.pdf stripped.pdf 48,49,50,51,52,53,62,63,64,65,66,68,101,102,103,106,107,108,109,110,111 + + d. Re-compress the file: + + $ pdftk stripped.pdf output final.pdf compress + + +Limitations + +Sometimes pdfimages misses images, or reports them as inlined even when they +are not, so you may need to look at the PDF source to spot the missing IDs. 
+ +Inline images cannot be stripped by pdfstrip, but they are easy to spot in the +PDF source: they are delimited by markers "BI" and "EI" and there is always an +"ID" string between the two; removing the source code usually works but this +is a brute-force approach. diff --git a/pdfstrip.py b/pdfstrip.py new file mode 100755 index 0000000..1b287f8 --- /dev/null +++ b/pdfstrip.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# +# pdfstrip.py - strip objects from PDF files by specifying objects IDs +# +# Copyright (C) 2016 Antonio Ospite +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see http://www.gnu.org/licenses/. 
import argparse
import logging
import sys

from pdfrw import PdfReader, PdfWriter
from pdfrw.objects.pdfindirect import PdfIndirect


# pylint: disable=invalid-name
# Module logger with its own console handler; propagation is disabled so
# records are not emitted a second time by handlers on ancestor loggers.
logger = logging.getLogger(__name__)
console_handler = logging.StreamHandler()
formatter = logging.Formatter('[%(levelname)s] %(funcName)s:%(lineno)d %(message)s')
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
logger.propagate = False


def resolve_objects_names(pdf, objects_ids):
    """Map numeric object IDs to the values of the objects' 'Name' attribute.

    Each ID is looked up with ``pdf.findindirect(objnum, 0)``; the second
    argument is always 0 (presumably the generation number -- confirm
    against the pdfrw API).  IDs that do not yield a ``PdfIndirect``, whose
    resolved object has no ``Name`` attribute, or whose ``Name`` is empty,
    are skipped with a warning.

    Args:
        pdf: the document object returned by ``PdfReader``.
        objects_ids: iterable of integer object numbers.

    Returns:
        A list with the ``Name`` value of every object that could be
        resolved, in the order of ``objects_ids``.
    """
    logger.debug("objects_ids %s", objects_ids)
    objects_names = []
    for objnum in objects_ids:
        indirect_object = pdf.findindirect(objnum, 0)
        if not isinstance(indirect_object, PdfIndirect):
            logger.warning("Object %d is not a PdfIndirect but a %s",
                           objnum, type(indirect_object))
            continue

        try:
            name = indirect_object.real_value().Name
        except AttributeError:
            # Raised when the resolved value does not expose 'Name' at all
            # (for instance when it resolves to None).
            logger.warning("Object %d has no 'Name' attribute", objnum)
            continue

        if name:
            objects_names.append(name)
        else:
            logger.warning("Object %d has an empty 'Name' attribute", objnum)

    logger.debug("objects_names %s\n", objects_names)
    return objects_names


def strip_objects(pdf, objects_names):
    """Delete the named entries from the XObject resources of every page.

    Pages that have no ``Resources`` entry, or whose resources have no
    ``XObject`` entry, are skipped instead of raising ``AttributeError``.

    Args:
        pdf: the document object returned by ``PdfReader``; modified in place.
        objects_names: names (as returned by ``resolve_objects_names``) of
            the XObject entries to delete.

    Returns:
        The same ``pdf`` object, for convenience.
    """
    for i, page in enumerate(pdf.pages):
        logger.debug("Page %d", i + 1)

        # NOTE(review): pdfrw appears to return None for missing dictionary
        # keys accessed as attributes -- confirm; the guards below rely on it.
        resources = page.Resources
        xobjects = resources.XObject if resources is not None else None
        if xobjects is None:
            logger.debug("No XObject resources on this page, nothing to strip\n")
            continue

        logger.debug("Before %s", xobjects.keys())
        for obj in objects_names:
            if obj in xobjects:
                del xobjects[obj]

        logger.debug("After %s\n", xobjects.keys())

    return pdf


def validate_objects_ids(objects_ids_string):
    """Parse a comma-separated list of integers (argparse ``type=`` callable).

    Args:
        objects_ids_string: text such as ``"48,49,50"``.

    Returns:
        The list of parsed integers, in input order.

    Raises:
        argparse.ArgumentTypeError: if any item is not a valid integer
            (this includes empty items, e.g. from a trailing comma).
    """
    try:
        return [int(obj) for obj in objects_ids_string.split(',')]
    except ValueError as err:
        raise argparse.ArgumentTypeError(
            "%s contains an invalid value" % objects_ids_string) from err


def main():
    """Command-line entry point: read a PDF, strip the objects, write it out.

    Returns:
        0, used as the process exit status.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("input_filename",
                        help="the input PDF file (better if uncompressed)")
    parser.add_argument("output_filename",
                        help="the output PDF file")
    parser.add_argument("objects_ids",
                        type=validate_objects_ids,
                        help="a comma-separated list of objects IDs")
    parser.add_argument("-d", "--debug", action="store_true",
                        help="enable debug output")
    args = parser.parse_args()

    if args.debug:
        logger.setLevel(logging.DEBUG)

    pdf_data = PdfReader(args.input_filename)

    objects_names = resolve_objects_names(pdf_data, args.objects_ids)
    pdf_data = strip_objects(pdf_data, objects_names)

    PdfWriter().write(args.output_filename, pdf_data)

    return 0


if __name__ == '__main__':
    sys.exit(main())