#! /usr/bin/python
"""Poll a 4chan thread and save the JPEG images its posts link to.

Usage: pass the thread URL as the only command-line argument.  The thread is
re-fetched every few seconds until it can no longer be fetched or its page
reports a 404; every image not already present in ``localFolder`` is saved.

The code runs on both Python 2 and Python 3.
"""

import os.path
import re
import sys
import time

try:
    import httplib  # Python 2
except ImportError:
    import http.client as httplib  # Python 3

# Set this to where you want them saved
localFolder = '/home/joe/tmp/'

# Seconds before a stalled HTTP connection is abandoned; without a timeout a
# single hung socket would block the polling loop forever.
HTTP_TIMEOUT = 30

# Seconds to wait between two passes over the thread.
POLL_INTERVAL = 5


def splitUrl(url):
    """Split ``scheme://host/path`` into ``(host, path)``.

    The scheme is ignored and the returned path has no leading slash (it is
    the empty string when the url has no path).  Returns None when the url
    contains no ``//`` separator or no host.
    """
    _scheme, sep, rest = url.partition('//')
    if not sep:
        return None
    # Only the first '/' after the host separates host from path, so a path
    # that itself contains '//' is kept intact.
    host, _slash, path = rest.partition('/')
    if not host:
        return None
    return host, path


def getPage(url):
    """Return the body of ``url`` fetched with a plain-HTTP GET.

    Returns the raw response body, or False when the url cannot be parsed or
    the request fails.  The HTTP status code is not inspected, so the body of
    an error page is returned like any other body.
    """
    parts = splitUrl(url)
    if parts is None:
        return False
    host, path = parts

    try:
        conn = httplib.HTTPConnection(host, timeout=HTTP_TIMEOUT)
        try:
            print("Connected to " + host)
            conn.request("GET", '/' + path)
            print("Sent request for " + path)
            response = conn.getresponse()
            print("Got response")
            page = response.read()
            print("Read")
        finally:
            # Release the socket even when the request fails half-way.
            conn.close()
    except (httplib.HTTPException, EnvironmentError, ValueError):
        # Network, protocol and bad-url errors mean "no page"; anything else
        # (notably KeyboardInterrupt) is allowed to propagate.
        return False

    print("Closed")
    return page


def savePage(url, fn):
    """Download ``url`` into the file ``fn`` unless ``fn`` already exists.

    Returns True when the file already existed or was saved, False when the
    download or the write failed.
    """
    # Skip existing
    if os.path.exists(fn):
        return True

    contents = getPage(url)
    if contents is False:
        print("Error fetching")
        return False

    # Write to a scratch name first and rename on success, so an interrupted
    # write never leaves a truncated file that the exists-check above would
    # treat as already downloaded.
    partial = fn + '.part'
    try:
        with open(partial, 'wb') as out:  # binary: the payload is image data
            out.write(contents)
        os.rename(partial, fn)
    except EnvironmentError:
        print("Error saving file to " + fn)
        return False

    print(fn + " saved")
    return True


def name(url):
    """Return the last ``/``-separated component of ``url``."""
    return url.rsplit('/', 1)[-1]


def main():
    """Watch the thread given on the command line and save its images.

    Loops until the thread page cannot be fetched or reports a 404, sleeping
    ``POLL_INTERVAL`` seconds between passes.
    """
    # Pick thread from cli arg
    if len(sys.argv) != 2:
        print("Thread not specified. Pass url to 4chan thread as the argument to this program")
        return
    url = sys.argv[1]

    # Our matching pattern.  Group 5 (index 4 of each findall tuple) is the
    # image url.
    # NOTE(review): in the source this literal was broken across a line break
    # just before "(.*)" and looks like it lost its opening HTML tags; the two
    # halves are joined here with no newline in between - confirm against the
    # original pattern.
    pat = re.compile(
        r"([^<]+)<\/span> (?:([^<]+)<\/span> )?"
        r"(\d+\/\d+\/\d+\([^)]+\)\d+:\d+(?::\d+)?)"
        r".+quote\('(\d+)'\)"
        r".+(http:\/\/images\.4chan\.org[^\.]+\.jpg).+"
        r"(.*)<\/blockquote>"
    )
    deadThread = re.compile(r'4chan \- 404<\/title>')

    # Go until 404 or some other issue
    while True:
        # get contents of page
        contents = getPage(url)

        # That failed?
        if contents is False:
            print("Failed fetching thread")
            break

        # On Python 3 the body arrives as bytes; the patterns are text.
        if not isinstance(contents, str):
            contents = contents.decode('utf-8', 'replace')

        # make sure thread isn't dead, in which case it 404's
        if deadThread.search(contents) is not None:
            print("Thread dead")
            break

        # Get all the images; count those saved now or already on disk
        found = 0
        for post in pat.findall(contents):
            imageUrl = post[4]
            if savePage(imageUrl, os.path.join(localFolder, name(imageUrl))):
                found += 1

        # Stats for this pass
        if found == 0:
            print("Nothing found with this pass")
        else:
            print("%d found with this pass" % found)

        # Wait some time between checks
        time.sleep(POLL_INTERVAL)
        print("Trying again..")


if __name__ == '__main__':
    main()