[0265c6d] | 1 | #!/usr/bin/python |
---|
| 2 | import sys |
---|
| 3 | import os, errno |
---|
| 4 | from subprocess import Popen, PIPE |
---|
| 5 | import tempfile |
---|
| 6 | import re |
---|
| 7 | import traceback |
---|
| 8 | |
---|
# Whitelist for text that gets embedded inside a double-quoted argument on a
# wkhtmltopdf stdin request line: one or more printable ASCII characters
# (space through '~') excluding '"' and '\'.
# Anchored with \Z rather than $: in Python, $ also matches just before a
# trailing newline, so "url\n" would pass and break the request line in two.
wkhtml_safe_pattern = re.compile(r'^[] !#-[^-~]+\Z')
# Extra wkhtmltopdf command-line options: all four page margins set to 0mm.
wkhtml_args = ['--margin-bottom', '0mm', '--margin-top', '0mm', '--margin-left', '0mm', '--margin-right', '0mm', ]
---|
| 11 | |
---|
def is_safe_for_wkhtml(url):
    """Check *url* against the module-level ``wkhtml_safe_pattern``.

    Prints the value being checked, then returns the regex match object
    (truthy) when the whole string matches the pattern, or ``None``
    otherwise.
    """
    print("Checking %s" % (url, ))
    verdict = wkhtml_safe_pattern.match(url)
    return verdict
---|
| 15 | |
---|
def convert_pdf_to_jpg(pdf, dest, ):
    """Render the PDF at path *pdf* to a JPEG at path *dest*.

    Runs the equivalent of the shell pipeline

        pdftoppm < $pdf | pnmcrop | pnmscale 0.75 | pnmtojpeg --optimize > $dest

    Any existing file at *dest* is removed first.  Every stage is waited
    for and all file/pipe handles are closed before returning.  Exit
    statuses of the stages are not inspected.

    Raises IOError/OSError if *pdf* or *dest* cannot be opened, if *dest*
    cannot be removed for a reason other than not existing, or if one of
    the programs cannot be started.
    """
    commands = [
        ['pdftoppm'],
        ['pnmcrop'],
        ['pnmscale', '0.75', ],
        ['pnmtojpeg', '--optimize', ],
    ]
    with open(pdf, 'rb') as pdffile:
        try:
            os.unlink(dest)
        except OSError as exc:
            # A missing destination is fine; anything else is a real error.
            if exc.errno != errno.ENOENT:
                raise
        with open(dest, 'wb') as jpgfile:
            procs = []
            try:
                source = pdffile
                for index, command in enumerate(commands):
                    if index == len(commands) - 1:
                        sink = jpgfile
                    else:
                        sink = PIPE
                    procs.append(Popen(command, stdin=source, stdout=sink, ))
                    if source is not pdffile:
                        # Drop the parent's copy of the pipe so the upstream
                        # stage sees a closed pipe if its consumer exits early.
                        source.close()
                    source = procs[-1].stdout
            finally:
                # Reap every stage that was started (also on a failed
                # launch), closing any pipe end the parent still holds so
                # that no stage blocks forever on a full pipe.
                for proc in procs:
                    if proc.stdout is not None:
                        proc.stdout.close()
                    proc.wait()
    print("Theoretically, wrote JPG to '%s'" % (dest, ))
---|
| 32 | |
---|
def generate_webpage_previews(websites):
    """
    Generate previews of websites.

    Takes one argument --- a list of (source url, destination image
    location) pairs.

    Each URL that passes is_safe_for_wkhtml is rendered to a temporary
    PDF by a single util/wkhtmltopdf process (requests are fed on its
    stdin, one per line) and then converted to a JPG at the destination
    with convert_pdf_to_jpg.  The temporary PDFs are removed before
    returning.  wkhtmltopdf is not started when there is nothing to render.

    Returns a list of (url, errmsg, ) pairs indicating failed conversions.
    """

    preview_requests = []
    jpg_convert_requests = []
    tmpfiles = []
    failures = []
    try:
        for url, dest in websites:
            if not is_safe_for_wkhtml(url):
                failures.append((url, "URL '%s' not safe for wkhtml" % (url, ), ))
                continue
            # delete=False: wkhtmltopdf writes to this path after we close it.
            tmpfile = tempfile.NamedTemporaryFile(delete=False)
            tmpfile.close()
            tmpfiles.append(tmpfile.name)
            # The temp path is quoted into the request line too, so it must
            # pass the same check.  Reported as a failure rather than
            # asserted, because asserts vanish under ``python -O``.
            if not is_safe_for_wkhtml(tmpfile.name):
                failures.append((
                    url,
                    "Temporary file '%s' not safe for wkhtml" % (tmpfile.name, ),
                ))
                continue
            preview_requests.append('"%s" "%s"' % (url, tmpfile.name,))
            jpg_convert_requests.append((url, tmpfile.name, dest,))

        if preview_requests:
            wkhtml = Popen(['util/wkhtmltopdf', '--read-args-from-stdin', ] + wkhtml_args, stdin=PIPE, )
            wkhtml.communicate("\n".join(preview_requests))

        for url, pdf, dest in jpg_convert_requests:
            try:
                convert_pdf_to_jpg(pdf, dest)
            except Exception:
                # Record the failure and carry on with the remaining sites,
                # as the documented return value promises.
                failures.append((
                    url,
                    "URL '%s' not JPGized:\n%s" % (url, traceback.format_exc()),
                ))
    finally:
        # Best-effort cleanup of the intermediate PDFs; a leftover temp file
        # must not mask the results or an exception already in flight.
        for path in tmpfiles:
            try:
                os.unlink(path)
            except OSError:
                pass

    return failures
---|
| 71 | |
---|
def generate_webpage_preview(url, dest):
    """Generate a preview of a single website.

    Delegates to generate_webpage_previews with a one-element list.
    Returns the error message of the first reported failure, or None
    when no failure was reported.
    """
    problems = generate_webpage_previews([(url, dest), ])
    return problems[0][1] if problems else None
---|
| 78 | |
---|
if __name__ == '__main__':
    # Manual smoke test: render two sites to JPGs under /tmp.
    print("In main")
    test_pairs = [
        ("http://ua.mit.edu/", "/tmp/uamitedu.jpg"),
        ("http://scripts.mit.edu/", "/tmp/scripts.jpg"),
    ]
    generate_webpage_previews(test_pairs)
---|