| 1 | #!/usr/bin/python |
|---|
| 2 | import sys |
|---|
| 3 | import os, errno |
|---|
| 4 | from subprocess import Popen, PIPE |
|---|
| 5 | import tempfile |
|---|
| 6 | import re |
|---|
| 7 | import traceback |
|---|
| 8 | |
|---|
# Strings accepted here are later wrapped in double quotes and written, one
# request per line, to wkhtmltopdf's --read-args-from-stdin stream.  The
# character class therefore admits only printable ASCII (0x20-0x7E) minus the
# double quote (0x22) and the backslash (0x5C): ']', ' ', '!', '#'..'[' and
# '^'..'~'.
#
# The pattern is anchored with \Z rather than $ because $ also matches just
# before a trailing newline, which would let "http://host/\n" through and
# split the request across two input lines.
wkhtml_safe_pattern = re.compile(r'^[] !#-[^-~]+\Z')

# Extra wkhtmltopdf options: no page margins on any side.
wkhtml_args = [
    '--margin-bottom', '0mm',
    '--margin-top', '0mm',
    '--margin-left', '0mm',
    '--margin-right', '0mm',
]
|---|
| 11 | |
|---|
def is_safe_for_wkhtml(url):
    """Check *url* against the module-level ``wkhtml_safe_pattern``.

    Prints a "Checking ..." line for the value being tested, then returns
    whatever ``wkhtml_safe_pattern.match`` returns: a match object (truthy)
    when the pattern matches, ``None`` otherwise.
    """
    print("Checking %s" % (url, ))
    verdict = wkhtml_safe_pattern.match(url)
    return verdict
|---|
| 15 | |
|---|
def convert_pdf_to_jpg(pdf, dest, ):
    """Render the PDF at path *pdf* to a JPEG written at path *dest*.

    Runs the equivalent of the shell pipeline

        pdftoppm < $pdf | pnmcrop | pnmscale 0.75 | pnmtojpeg --optimize > $dest

    Any existing file at *dest* is unlinked before the new one is created.

    Both files are closed and every child process is waited for before
    returning.  A stage that exits with a non-zero status does not raise;
    it is reported with a printed warning.

    Raises:
        IOError/OSError: if *pdf* or *dest* cannot be opened, if unlinking
            *dest* fails for a reason other than it not existing, or if one
            of the external tools cannot be started.
    """
    pipeline = [
        ['pdftoppm'],
        ['pnmcrop'],
        ['pnmscale', '0.75'],
        ['pnmtojpeg', '--optimize'],
    ]

    # The PDF is opened before the destination is touched, so an unreadable
    # source leaves any existing preview in place.
    with open(pdf, 'rb') as pdffile:
        try:
            os.unlink(dest)
        except OSError as exc:
            # A missing destination is fine; anything else is a real error.
            if exc.errno != errno.ENOENT:
                raise

        with open(dest, 'wb') as jpgfile:
            procs = []
            upstream = pdffile
            try:
                for position, argv in enumerate(pipeline):
                    if position == len(pipeline) - 1:
                        sink = jpgfile
                    else:
                        sink = PIPE
                    procs.append(Popen(argv, stdin=upstream, stdout=sink))
                    if upstream is not pdffile:
                        # The child holds its own copy of the pipe; closing
                        # ours lets the producer notice if its consumer
                        # exits early.
                        upstream.close()
                    upstream = procs[-1].stdout
            finally:
                # If starting a stage failed, the previous stage's pipe is
                # still open here; close it so that stage can terminate.
                if upstream is not None and upstream is not pdffile:
                    upstream.close()
                # Reap every child that was started, not just the last one.
                for proc in procs:
                    proc.wait()

    failed = [
        "%s (exit %s)" % (argv[0], proc.returncode)
        for argv, proc in zip(pipeline, procs)
        if proc.returncode != 0
    ]
    if failed:
        print("Warning: converting '%s' to '%s' failed in: %s"
              % (pdf, dest, ", ".join(failed)))
    else:
        print("Theoretically, wrote JPG to '%s'" % (dest, ))
|---|
| 32 | |
|---|
def generate_webpage_previews(websites):
    """
    Generate previews of websites.

    Takes one argument --- a list of (source url, destination image
    location) pairs.

    Each URL that passes is_safe_for_wkhtml is rendered to a temporary PDF
    by a single util/wkhtmltopdf process (requests are fed on its stdin, one
    per line), and each PDF is then turned into a JPG at the destination by
    convert_pdf_to_jpg.  The temporary PDFs are removed before returning.

    Returns a list of (url, errmsg, ) pairs indicating failed conversions.
    A URL is reported as failed when it is not safe to pass to wkhtml, when
    its temporary file name is not safe, or when convert_pdf_to_jpg raises.
    """

    preview_requests = []
    jpg_convert_requests = []
    tmpfiles = []
    failures = []
    try:
        for url, dest in websites:
            if not is_safe_for_wkhtml(url):
                failures.append((url, "URL '%s' not safe for wkhtml" % (url, ), ))
                continue

            # delete=False: wkhtmltopdf and the conversion step reopen the
            # file by name after it is closed here; it is removed in the
            # finally clause below.
            tmpfile = tempfile.NamedTemporaryFile(delete=False)
            tmpfile.close()
            tmpfiles.append(tmpfile.name)

            # Checked explicitly instead of with assert, which is stripped
            # when Python runs with -O.
            if not is_safe_for_wkhtml(tmpfile.name):
                failures.append((
                    url,
                    "Temporary file '%s' not safe for wkhtml" % (tmpfile.name, ),
                ))
                continue

            preview_requests.append('"%s" "%s"' % (url, tmpfile.name,))
            jpg_convert_requests.append((url, tmpfile.name, dest,))

        # Nothing to render: do not start wkhtmltopdf at all.
        if preview_requests:
            wkhtml = Popen(['util/wkhtmltopdf', '--read-args-from-stdin', ] + wkhtml_args, stdin=PIPE, )
            # Every request passed the safety pattern, so it is plain ASCII;
            # encoding keeps the pipe write valid on Python 3 as well.
            wkhtml.communicate("\n".join(preview_requests).encode('ascii'))

        for url, pdf, dest in jpg_convert_requests:
            try:
                convert_pdf_to_jpg(pdf, dest)
            except Exception:
                # Record the failure and carry on with the remaining URLs.
                failures.append((
                    url,
                    "URL '%s' not JPGized:\n%s" % (url, traceback.format_exc()),
                ))
    finally:
        for name in tmpfiles:
            try:
                os.unlink(name)
            except OSError:
                # Best-effort cleanup: a temp file that cannot be removed
                # must not hide the real result or error.
                pass

    return failures
|---|
| 71 | |
|---|
def generate_webpage_preview(url, dest):
    """Generate a preview for a single website.

    Calls generate_webpage_previews with the one (url, dest) pair and
    returns the error message of the first reported failure, or None when
    no failure was reported.
    """
    problems = generate_webpage_previews([(url, dest), ])
    if not problems:
        return None
    return problems[0][1]
|---|
| 78 | |
|---|
if __name__ == '__main__':
    # Manual smoke test: render two sites to JPGs under /tmp.
    print("In main")
    generate_webpage_previews([
        ("http://ua.mit.edu/", "/tmp/uamitedu.jpg", ),
        ("http://scripts.mit.edu/", "/tmp/scripts.jpg", ),
    ])
|---|