1 | #!/usr/bin/python |
---|
2 | import sys |
---|
3 | import os, errno |
---|
4 | from subprocess import Popen, PIPE |
---|
5 | import tempfile |
---|
6 | import re |
---|
7 | import traceback |
---|
8 | |
---|
# Strings accepted for inclusion, wrapped in double quotes, on a
# wkhtmltopdf stdin argument line: one or more printable ASCII characters
# (space through '~') excluding '"' and '\'.
# The pattern ends with \Z rather than $ because $ also matches just
# before a trailing newline; a value ending in "\n" would otherwise pass
# the check and break the one-request-per-line format.
wkhtml_safe_pattern = re.compile(r'^[] !#-[^-~]+\Z')
# Extra wkhtmltopdf command-line options: zero margins on all four sides.
wkhtml_args = ['--margin-bottom', '0mm', '--margin-top', '0mm', '--margin-left', '0mm', '--margin-right', '0mm', ]
---|
11 | |
---|
def is_safe_for_wkhtml(url):
    """
    Check whether a string may be handed to wkhtmltopdf.

    Prints the string being checked, then returns the result of matching
    it against wkhtml_safe_pattern: a match object (truthy) when the
    pattern matches, None otherwise.
    """
    # Parenthesised single-argument form: prints the same text under the
    # Python 2 print statement and is a valid call on Python 3.
    print("Checking %s" % (url, ))
    return wkhtml_safe_pattern.match(url)
---|
15 | |
---|
def convert_pdf_to_jpg(pdf, dest, ):
    """
    Render a PDF file to a cropped, scaled JPEG using external tools.

    Runs the equivalent of the shell pipeline

        pdftoppm < $pdf | pnmcrop | pnmscale 0.75 | pnmtojpeg --optimize > $dest

    pdf is the path of the PDF to read and dest is the path of the JPEG
    to write.  Any existing file at dest is unlinked before a new one is
    created.

    Raises IOError/OSError if pdf or dest cannot be opened, if dest
    cannot be removed for a reason other than not existing, or if one of
    the tools cannot be started.  The exit statuses of the tools are NOT
    checked, so a failing tool does not raise.
    """
    pipeline = (
        ['pdftoppm'],
        ['pnmcrop'],
        ['pnmscale', '0.75', ],
        ['pnmtojpeg', '--optimize', ],
    )
    # Binary mode: neither PDF nor JPEG data is text.  The with-blocks
    # make sure both files are closed even if a tool fails to start.
    with open(pdf, 'rb') as pdffile:
        try:
            os.unlink(dest)
        except OSError as exc:
            # A missing destination is fine; anything else is a real error.
            if exc.errno != errno.ENOENT:
                raise
        with open(dest, 'wb') as jpgfile:
            procs = []
            try:
                source = pdffile
                for argv in pipeline[:-1]:
                    proc = Popen(argv, stdin=source, stdout=PIPE, )
                    procs.append(proc)
                    source = proc.stdout
                procs.append(Popen(pipeline[-1], stdin=source, stdout=jpgfile, ))
            finally:
                # Close this process's copies of the pipe read ends so the
                # only readers left are the downstream tools, then reap
                # every child (not just the last) to avoid zombies.
                for proc in procs:
                    if proc.stdout is not None:
                        proc.stdout.close()
                for proc in procs:
                    proc.wait()
    print("Theoretically, wrote JPG to '%s'" % (dest, ))
---|
32 | |
---|
def generate_webpage_previews(websites):
    """
    Generate previews of websites.

    Takes one argument --- a list of (source url, destination image
    location) pairs.

    Each URL that passes is_safe_for_wkhtml is rendered to a temporary
    PDF by a single util/wkhtmltopdf process (requests are fed on its
    stdin), and each PDF is then converted with convert_pdf_to_jpg.  The
    temporary PDFs are removed before returning.

    Returns a list of (url, errmsg, ) pairs indicating failed conversions.
    URLs rejected by is_safe_for_wkhtml and conversions that raise are
    both reported this way rather than by raising.
    """

    preview_requests = []
    jpg_convert_requests = []
    tmpfiles = []
    failures = []
    try:
        for url, dest in websites:
            if not is_safe_for_wkhtml(url):
                failures.append((url, "URL '%s' not safe for wkhtml" % (url, ), ))
                continue
            # delete=False: only the name is needed here; wkhtmltopdf
            # writes the file later.  Cleanup happens in the finally below.
            tmpfile = tempfile.NamedTemporaryFile(delete=False)
            tmpfile.close()
            tmpfiles.append(tmpfile.name)
            # Explicit check instead of assert, which is stripped under -O;
            # the path is quoted into the same stdin line as the URL.
            if not is_safe_for_wkhtml(tmpfile.name):
                failures.append((
                    url,
                    "Temporary file '%s' not safe for wkhtml" % (tmpfile.name, ),
                ))
                continue
            preview_requests.append('"%s" "%s"' % (url, tmpfile.name,))
            jpg_convert_requests.append((url, tmpfile.name, dest,))

        # Nothing to render: do not spawn wkhtmltopdf at all.
        if preview_requests:
            wkhtml = Popen(['util/wkhtmltopdf', '--read-args-from-stdin', ] + wkhtml_args, stdin=PIPE, )
            # The pipe is binary; every request has passed the ASCII-only
            # safety check, so encoding to ASCII cannot fail.
            wkhtml.communicate("\n".join(preview_requests).encode('ascii'))

        for url, pdf, dest in jpg_convert_requests:
            try:
                convert_pdf_to_jpg(pdf, dest)
            except Exception:
                # Record the failure and keep going with the other URLs.
                failures.append((
                    url,
                    "URL '%s' not JPGized:\n%s" % (url, traceback.format_exc()),
                ))
    finally:
        # Best-effort removal of the intermediate PDFs; a failure here
        # must not mask the result or an exception already in flight.
        for path in tmpfiles:
            try:
                os.unlink(path)
            except OSError:
                pass

    return failures
---|
71 | |
---|
def generate_webpage_preview(url, dest):
    """
    Generate a preview image for a single website.

    Wraps generate_webpage_previews for one (url, dest) pair.  Returns
    the error message of the first reported failure, or None when no
    failure was reported.
    """
    errors = generate_webpage_previews([(url, dest), ])
    return errors[0][1] if errors else None
---|
78 | |
---|
if __name__ == '__main__':
    # Manual smoke test: render two sample sites to JPEGs under /tmp.
    print("In main")
    test_pairs = [
        ("http://ua.mit.edu/", "/tmp/uamitedu.jpg", ),
        ("http://scripts.mit.edu/", "/tmp/scripts.jpg", ),
    ]
    # Report failed conversions instead of discarding the returned list.
    for _url, errmsg in generate_webpage_previews(test_pairs):
        sys.stderr.write("%s\n" % (errmsg, ))
---|