Diffstat (limited to 'tools/bug_tool/ClientCookie/_urllib2_support.py')
-rw-r--r--  tools/bug_tool/ClientCookie/_urllib2_support.py  713
1 file changed, 713 insertions, 0 deletions
diff --git a/tools/bug_tool/ClientCookie/_urllib2_support.py b/tools/bug_tool/ClientCookie/_urllib2_support.py
new file mode 100644
index 0000000000..d767d08b25
--- /dev/null
+++ b/tools/bug_tool/ClientCookie/_urllib2_support.py
@@ -0,0 +1,713 @@
+"""Integration with Python standard library module urllib2.
+
+Also includes a redirection bugfix, support for parsing HTML HEAD blocks for
+the META HTTP-EQUIV tag contents, and support for following Refresh
+header redirects.
+
+Copyright 2002-2003 John J Lee <jjl@pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD License (see the file COPYING included with the
+distribution).
+
+"""
+
+import copy, time
+
+import ClientCookie
+from _ClientCookie import CookieJar, request_host
+from _Util import isstringlike
+from _Debug import _debug
+
+try: True
+except NameError:
+ True = 1
+ False = 0
+
+CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes
+
+try:
+ from urllib2 import AbstractHTTPHandler
+except ImportError:
+ pass
+else:
+ import urlparse, urllib2, urllib, httplib, htmllib, formatter, string
+ from urllib2 import URLError, HTTPError
+    import types, socket
+ from cStringIO import StringIO
+ from _Util import seek_wrapper
+ try:
+ import threading
+ _threading = threading; del threading
+ except ImportError:
+ import dummy_threading
+ _threading = dummy_threading; del dummy_threading
+
+ # This fixes a bug in urllib2 as of Python 2.1.3 and 2.2.2
+ # (http://www.python.org/sf/549151)
+ # 2.2.3 is broken here (my fault!), 2.3 is fixed.
+ class HTTPRedirectHandler(urllib2.BaseHandler):
+ # maximum number of redirections before assuming we're in a loop
+ max_redirections = 10
+
+ # Implementation notes:
+
+ # To avoid the server sending us into an infinite loop, the request
+ # object needs to track what URLs we have already seen. Do this by
+        # adding a handler-specific attribute to the Request object: a dict
+        # whose values count the number of times each URL has been visited.
+        # The count matters because revisiting isn't necessarily a loop:
+ # there is more than one way to redirect (Refresh, 302, 303, 307).
+
+ # Another handler-specific Request attribute, original_url, is used to
+ # remember the URL of the original request so that it is possible to
+ # decide whether or not RFC 2965 cookies should be turned on during
+ # redirect.
+
+ # Always unhandled redirection codes:
+ # 300 Multiple Choices: should not handle this here.
+ # 304 Not Modified: no need to handle here: only of interest to caches
+ # that do conditional GETs
+ # 305 Use Proxy: probably not worth dealing with here
+        # 306 Unused: what was this for in previous versions of the protocol?
+
+ def redirect_request(self, newurl, req, fp, code, msg, headers):
+ """Return a Request or None in response to a redirect.
+
+ This is called by the http_error_30x methods when a redirection
+ response is received. If a redirection should take place, return a
+ new Request to allow http_error_30x to perform the redirect;
+ otherwise, return None to indicate that an HTTPError should be
+ raised.
+
+ """
+ if code in (301, 302, 303) or (code == 307 and not req.has_data()):
+ # Strictly (according to RFC 2616), 301 or 302 in response to
+ # a POST MUST NOT cause a redirection without confirmation
+ # from the user (of urllib2, in this case). In practice,
+ # essentially all clients do redirect in this case, so we do
+ # the same.
+ return Request(newurl, headers=req.headers)
+ else:
+ raise HTTPError(req.get_full_url(), code, msg, headers, fp)
+
+ def http_error_302(self, req, fp, code, msg, headers):
+ if headers.has_key('location'):
+ newurl = headers['location']
+ elif headers.has_key('uri'):
+ newurl = headers['uri']
+ else:
+ return
+ newurl = urlparse.urljoin(req.get_full_url(), newurl)
+
+ # XXX Probably want to forget about the state of the current
+ # request, although that might interact poorly with other
+ # handlers that also use handler-specific request attributes
+ new = self.redirect_request(newurl, req, fp, code, msg, headers)
+ if new is None:
+ return
+
+ # remember where we started from
+ if hasattr(req, "original_url"):
+ new.original_url = req.original_url
+ else:
+ new.original_url = req.get_full_url()
+
+ # loop detection
+            # .error_302_dict[(url, code)] is the number of times url has
+            # previously been visited as a result of a redirection with this
+ # code (error_30x_dict would be a better name).
+ new.origin_req_host = req.origin_req_host
+ if not hasattr(req, 'error_302_dict'):
+ new.error_302_dict = req.error_302_dict = {(newurl, code): 1}
+ else:
+ ed = new.error_302_dict = req.error_302_dict
+ nr_visits = ed.get((newurl, code), 0)
+ # Refreshes generate fake 302s, so we can hit the same URL as
+ # a result of the same redirection code twice without
+ # necessarily being in a loop! So, allow two visits to each
+ # URL as a result of each redirection code.
+ if len(ed) < self.max_redirections and nr_visits < 2:
+ ed[(newurl, code)] = nr_visits + 1
+ else:
+ raise HTTPError(req.get_full_url(), code,
+ self.inf_msg + msg, headers, fp)
+
+ if ClientCookie.REDIRECT_DEBUG:
+ _debug("redirecting to %s", newurl)
+
+ # Don't close the fp until we are sure that we won't use it
+ # with HTTPError.
+ fp.read()
+ fp.close()
+
+ return self.parent.open(new)
+
+ http_error_301 = http_error_303 = http_error_307 = http_error_302
+
+ inf_msg = "The HTTP server returned a redirect error that would " \
+ "lead to an infinite loop.\n" \
+ "The last 30x error message was:\n"
+
+
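+    # Illustrative sketch (not part of the original module): refusing every
+    # redirect of a POST by overriding redirect_request().
+    class _StrictRedirectHandler(HTTPRedirectHandler):
+        def redirect_request(self, newurl, req, fp, code, msg, headers):
+            if req.has_data():  # a POST: raise instead of redirecting
+                raise HTTPError(req.get_full_url(), code, msg, headers, fp)
+            return HTTPRedirectHandler.redirect_request(
+                self, newurl, req, fp, code, msg, headers)
+
+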
+ class Request(urllib2.Request):
+ def __init__(self, url, data=None, headers={}):
+ urllib2.Request.__init__(self, url, data, headers)
+ self.unredirected_hdrs = {}
+
+ def add_unredirected_header(self, key, val):
+ # these headers do not persist from one request to the next in a chain
+ # of requests
+ self.unredirected_hdrs[string.capitalize(key)] = val
+
+ def has_key(self, header_name):
+ if (self.headers.has_key(header_name) or
+ self.unredirected_hdrs.has_key(header_name)):
+ return True
+ return False
+
+ def get(self, header_name, failobj=None):
+ if self.headers.has_key(header_name):
+ return self.headers[header_name]
+            if self.unredirected_hdrs.has_key(header_name):
+                return self.unredirected_hdrs[header_name]
+ return failobj
+
+
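+    # Sketch, for illustration: a header added with add_unredirected_header()
+    # is sent on this request but is not copied onto the new Request built
+    # during a redirect (the URL and credential below are made up).
+    def _example_unredirected_header():
+        req = Request("http://www.example.com/")
+        req.add_unredirected_header("Authorization", "Basic dXNlcjpwYXNz")
+        return req
+
+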
+ class BaseProcessor:
+ processor_order = 500
+
+ def add_parent(self, parent):
+ self.parent = parent
+ def close(self):
+ self.parent = None
+ def __lt__(self, other):
+ if not hasattr(other, "processor_order"):
+ return True
+ return self.processor_order < other.processor_order
+
+ class HTTPRequestUpgradeProcessor(BaseProcessor):
+        # upgrade Request to a class with support for headers that don't get
+ # redirected
+ processor_order = 0 # before anything else
+
+ def http_request(self, request):
+ if not hasattr(request, "add_unredirected_header"):
+ request = Request(request._Request__original, request.data,
+ request.headers)
+ return request
+
+ https_request = http_request
+
+ class HTTPEquivProcessor(BaseProcessor):
+ """Append META HTTP-EQUIV headers to regular HTTP headers."""
+ def http_response(self, request, response):
+ if not hasattr(response, "seek"):
+ response = seek_wrapper(response)
+ # grab HTTP-EQUIV headers and add them to the true HTTP headers
+ headers = response.info()
+ for hdr, val in parse_head(response):
+ headers[hdr] = val
+ response.seek(0)
+ return response
+
+ https_response = http_response
+
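+    # Note, for illustration: once HTTPEquivProcessor has run, a document
+    # whose HEAD contains <meta http-equiv="Refresh" content="5; url=/next">
+    # yields a response whose info() also reports "Refresh: 5; url=/next",
+    # which HTTPRefreshProcessor (below) can then act on.
+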
+ # XXX ATM this only takes notice of http responses -- probably
+ # should be independent of protocol scheme (http, ftp, etc.)
+ class SeekableProcessor(BaseProcessor):
+ """Make responses seekable."""
+
+ def http_response(self, request, response):
+ if not hasattr(response, "seek"):
+ return seek_wrapper(response)
+ return response
+
+ https_response = http_response
+
+ # XXX if this gets added to urllib2, unverifiable would end up as an
+ # attribute on Request.
+ class HTTPCookieProcessor(BaseProcessor):
+ """Handle HTTP cookies."""
+ def __init__(self, cookies=None):
+ if cookies is None:
+ cookies = CookieJar()
+ self.cookies = cookies
+
+ def _unverifiable(self, request):
+ if hasattr(request, "error_302_dict") and request.error_302_dict:
+ redirect = True
+ else:
+ redirect = False
+ if (redirect or
+ (hasattr(request, "unverifiable") and request.unverifiable)):
+ unverifiable = True
+ else:
+ unverifiable = False
+ return unverifiable
+
+ def http_request(self, request):
+ unverifiable = self._unverifiable(request)
+ if not unverifiable:
+ # Stuff request-host of this origin transaction into Request
+                # object, because we need it later to decide whether cookies
+ # should be in operation during derived requests (redirects,
+ # specifically -- including refreshes).
+ request.origin_req_host = request_host(request)
+ self.cookies.add_cookie_header(request, unverifiable)
+ return request
+
+ def http_response(self, request, response):
+ unverifiable = self._unverifiable(request)
+ self.cookies.extract_cookies(response, request, unverifiable)
+ return response
+
+ https_request = http_request
+ https_response = http_response
+
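+    # Sketch: passing an existing CookieJar keeps cookies across openers;
+    # build_opener (defined later in this module) skips its default
+    # HTTPCookieProcessor when it sees this instance.
+    def _example_shared_cookiejar(jar):
+        return build_opener(HTTPCookieProcessor(jar))
+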
+ class HTTPRefererProcessor(BaseProcessor):
+ """Add Referer header to requests.
+
+        This only makes sense if each HTTPRefererProcessor is used for a
+        single chain of requests (so, for example, using one
+        HTTPRefererProcessor to fetch a series of URLs extracted from a single
+        page will break: each response overwrites the stored referer).
+
+ """
+ def __init__(self):
+ self.referer = None
+
+ def http_request(self, request):
+ if ((self.referer is not None) and
+ not request.has_key("Referer")):
+ request.add_unredirected_header("Referer", self.referer)
+ return request
+
+ def http_response(self, request, response):
+ self.referer = response.geturl()
+ return response
+
+ https_request = http_request
+ https_response = http_response
+
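+    # Sketch: per the docstring above, create a fresh HTTPRefererProcessor
+    # for each linear chain of requests rather than sharing one.
+    def _example_referer_opener():
+        return build_opener(HTTPRefererProcessor())
+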
+ class HTTPStandardHeadersProcessor(BaseProcessor):
+ def http_request(self, request):
+ host = request.get_host()
+ if not host:
+ raise URLError('no host given')
+
+ if request.has_data(): # POST
+ data = request.get_data()
+ if not request.has_key('Content-type'):
+ request.add_unredirected_header(
+ 'Content-type',
+ 'application/x-www-form-urlencoded')
+ if not request.has_key('Content-length'):
+ request.add_unredirected_header(
+ 'Content-length', '%d' % len(data))
+
+ scheme, sel = urllib.splittype(request.get_selector())
+ sel_host, sel_path = urllib.splithost(sel)
+ if not request.has_key('Host'):
+ request.add_unredirected_header('Host', sel_host or host)
+ for name, value in self.parent.addheaders:
+ name = string.capitalize(name)
+ if not request.has_key(name):
+ request.add_unredirected_header(name, value)
+
+ return request
+
+ https_request = http_request
+
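+    # Sketch: for the made-up POST below, the processor above would fill in
+    # Content-type: application/x-www-form-urlencoded, Content-length: 7 and
+    # a Host header, since none is set explicitly.
+    def _example_post_request():
+        return Request("http://www.example.com/submit", "a=1&b=2")
+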
+ class HTTPResponseDebugProcessor(BaseProcessor):
+ processor_order = 900 # before redirections, after everything else
+
+ def http_response(self, request, response):
+ if not hasattr(response, "seek"):
+ response = seek_wrapper(response)
+ _debug(response.read())
+ _debug("*****************************************************")
+ response.seek(0)
+ return response
+
+ https_response = http_response
+
+ class HTTPRefreshProcessor(BaseProcessor):
+ """Perform HTTP Refresh redirections.
+
+ Note that if a non-200 HTTP code has occurred (for example, a 30x
+ redirect), this processor will do nothing.
+
+ By default, only zero-time Refresh headers are redirected. Use the
+ max_time constructor argument to allow Refresh with longer pauses.
+ Use the honor_time argument to control whether the requested pause
+ is honoured (with a time.sleep()) or skipped in favour of immediate
+ redirection.
+
+ """
+ processor_order = 1000
+
+ def __init__(self, max_time=0, honor_time=True):
+ self.max_time = max_time
+ self.honor_time = honor_time
+
+ def http_response(self, request, response):
+ code, msg, hdrs = response.code, response.msg, response.info()
+
+ if code == 200 and hdrs.has_key("refresh"):
+ refresh = hdrs["refresh"]
+ i = string.find(refresh, ";")
+ if i != -1:
+ pause, newurl_spec = refresh[:i], refresh[i+1:]
+ i = string.find(newurl_spec, "=")
+ if i != -1:
+ pause = int(pause)
+ if pause <= self.max_time:
+ if pause != 0 and self.honor_time:
+ time.sleep(pause)
+ newurl = newurl_spec[i+1:]
+ # fake a 302 response
+ hdrs["location"] = newurl
+ response = self.parent.error(
+ 'http', request, response, 302, msg, hdrs)
+
+ return response
+
+ https_response = http_response
+
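+    # Sketch: an opener that follows Refresh redirects with pauses of up to
+    # ten seconds, skipping the actual time.sleep().
+    def _example_refresh_opener():
+        return build_opener(HTTPRefreshProcessor(max_time=10, honor_time=False))
+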
+ class HTTPErrorProcessor(BaseProcessor):
+ """Process non-200 HTTP error responses.
+
+ This just passes the job on to the Handler.<proto>_error_<code>
+ methods, via the OpenerDirector.error method.
+
+ """
+ processor_order = 1000
+
+ def http_response(self, request, response):
+ code, msg, hdrs = response.code, response.msg, response.info()
+
+ if code != 200:
+ response = self.parent.error(
+ 'http', request, response, code, msg, hdrs)
+
+ return response
+
+ https_response = http_response
+
+
+ class OpenerDirector(urllib2.OpenerDirector):
+        # XXX might be useful to have remove_processor, too (say you want to
+        # set a new RefererProcessor, but keep the old CookieProcessor).
+        # Could always just create everything anew, though, using the old
+        # CookieJar object to create the new CookieProcessor.
+ def __init__(self):
+ urllib2.OpenerDirector.__init__(self)
+ #self.processors = []
+ self.process_response = {}
+ self.process_request = {}
+
+ def add_handler(self, handler):
+ # XXX
+ # tidy me
+ # the same handler could be added twice without detection
+            added = False
+ for meth in dir(handler.__class__):
+ if meth[-5:] == '_open':
+ protocol = meth[:-5]
+ if self.handle_open.has_key(protocol):
+ self.handle_open[protocol].append(handler)
+ self.handle_open[protocol].sort()
+ else:
+ self.handle_open[protocol] = [handler]
+                    added = True
+ continue
+ i = string.find(meth, '_')
+                j = string.find(meth[i+1:], '_') + i + 1
+                # j == i when there is no second underscore, and '' != 'error'
+                if j != i and meth[i+1:j] == 'error':
+ proto = meth[:i]
+ kind = meth[j+1:]
+ try:
+ kind = int(kind)
+ except ValueError:
+ pass
+ dict = self.handle_error.get(proto, {})
+ if dict.has_key(kind):
+ dict[kind].append(handler)
+ dict[kind].sort()
+ else:
+ dict[kind] = [handler]
+ self.handle_error[proto] = dict
+                    added = True
+ continue
+ if meth[-9:] == "_response":
+ protocol = meth[:-9]
+ if self.process_response.has_key(protocol):
+ self.process_response[protocol].append(handler)
+ self.process_response[protocol].sort()
+ else:
+ self.process_response[protocol] = [handler]
+ added = True
+ continue
+ elif meth[-8:] == "_request":
+ protocol = meth[:-8]
+ if self.process_request.has_key(protocol):
+ self.process_request[protocol].append(handler)
+ self.process_request[protocol].sort()
+ else:
+ self.process_request[protocol] = [handler]
+ added = True
+ continue
+ if added:
+ self.handlers.append(handler)
+ self.handlers.sort()
+ handler.add_parent(self)
+
+## def add_processor(self, processor):
+## added = False
+## for meth in dir(processor):
+## if meth[-9:] == "_response":
+## protocol = meth[:-9]
+## if self.process_response.has_key(protocol):
+## self.process_response[protocol].append(processor)
+## self.process_response[protocol].sort()
+## else:
+## self.process_response[protocol] = [processor]
+## added = True
+## continue
+## elif meth[-8:] == "_request":
+## protocol = meth[:-8]
+## if self.process_request.has_key(protocol):
+## self.process_request[protocol].append(processor)
+## self.process_request[protocol].sort()
+## else:
+## self.process_request[protocol] = [processor]
+## added = True
+## continue
+## if added:
+## self.processors.append(processor)
+## # XXX base class sorts .handlers, but I have no idea why
+## #self.processors.sort()
+## processor.add_parent(self)
+
+ def _request(self, url_or_req, data):
+ if isstringlike(url_or_req):
+ req = Request(url_or_req, data)
+ else:
+ # already a urllib2.Request instance
+ req = url_or_req
+ if data is not None:
+ req.add_data(data)
+ return req
+
+ def open(self, fullurl, data=None):
+ req = self._request(fullurl, data)
+ type = req.get_type()
+
+ # pre-process request
+ # XXX should we allow a Processor to change the type (URL
+ # scheme) of the request?
+ meth_name = type+"_request"
+ for processor in self.process_request.get(type, []):
+ meth = getattr(processor, meth_name)
+ req = meth(req)
+
+ response = urllib2.OpenerDirector.open(self, req, data)
+
+ # post-process response
+ meth_name = type+"_response"
+ for processor in self.process_response.get(type, []):
+ meth = getattr(processor, meth_name)
+ response = meth(req, response)
+
+ return response
+
+## def close(self):
+## urllib2.OpenerDirector.close(self)
+## for processor in self.processors:
+## processor.close()
+## self.processors = []
+
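+    # Sketch: any object whose methods follow the <scheme>_request /
+    # <scheme>_response naming convention is wired in by add_handler() above;
+    # e.g. a processor that sets a default User-agent header (this assumes
+    # HTTPRequestUpgradeProcessor has already upgraded the request).
+    class _ExampleUserAgentProcessor(BaseProcessor):
+        def http_request(self, request):
+            if not request.has_key("User-agent"):
+                request.add_unredirected_header("User-agent", "ClientCookie")
+            return request
+        https_request = http_request
+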
+
+ # Note the absence of redirect and header-adding code here
+ # (AbstractHTTPHandler), and the lack of other clutter that would be
+ # here without Processors.
+ class AbstractHTTPHandler(urllib2.BaseHandler):
+ def do_open(self, http_class, req):
+ host = req.get_host()
+ if not host:
+ raise URLError('no host given')
+
+ h = http_class(host) # will parse host:port
+ if ClientCookie.HTTP_DEBUG:
+ h.set_debuglevel(1)
+
+ if req.has_data():
+ h.putrequest('POST', req.get_selector())
+ else:
+ h.putrequest('GET', req.get_selector())
+
+ for k, v in req.headers.items():
+ h.putheader(k, v)
+ for k, v in req.unredirected_hdrs.items():
+ h.putheader(k, v)
+
+ # httplib will attempt to connect() here. be prepared
+ # to convert a socket error to a URLError.
+ try:
+ h.endheaders()
+ except socket.error, err:
+ raise URLError(err)
+ if req.has_data():
+ h.send(req.get_data())
+
+ code, msg, hdrs = h.getreply()
+ fp = h.getfile()
+
+ response = urllib.addinfourl(fp, hdrs, req.get_full_url())
+ response.code = code
+ response.msg = msg
+
+ return response
+
+
+ # XXX would self.reset() work, instead of raising this exception?
+ class EndOfHeadError(Exception): pass
+ class HeadParser(htmllib.HTMLParser):
+ # only these elements are allowed in or before HEAD of document
+ head_elems = ("html", "head",
+ "title", "base",
+ "script", "style", "meta", "link", "object")
+ def __init__(self):
+ htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
+ self.http_equiv = []
+
+ def start_meta(self, attrs):
+ http_equiv = content = None
+ for key, value in attrs:
+ if key == "http-equiv":
+ http_equiv = value
+ elif key == "content":
+ content = value
+ if http_equiv is not None:
+ self.http_equiv.append((http_equiv, content))
+
+ def handle_starttag(self, tag, method, attrs):
+ if tag in self.head_elems:
+ method(attrs)
+ else:
+ raise EndOfHeadError()
+
+ def handle_endtag(self, tag, method):
+ if tag in self.head_elems:
+ method()
+ else:
+ raise EndOfHeadError()
+
+ def end_head(self):
+ raise EndOfHeadError()
+
+ def parse_head(file):
+ """Return a list of key, value pairs."""
+ hp = HeadParser()
+ while 1:
+ data = file.read(CHUNK)
+ try:
+ hp.feed(data)
+ except EndOfHeadError:
+ break
+ if len(data) != CHUNK:
+                # a short read means EOF: the document (or what remains of
+                # it) was smaller than CHUNK
+ break
+ return hp.http_equiv
+
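+    # Sketch of parse_head() on a literal document (the input is made up):
+    def _example_parse_head():
+        html = ('<html><head>'
+                '<meta http-equiv="Refresh" content="5; url=/next">'
+                '</head><body></body></html>')
+        # returns [('Refresh', '5; url=/next')]
+        return parse_head(StringIO(html))
+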
+
+ class HTTPHandler(AbstractHTTPHandler):
+ def http_open(self, req):
+ return self.do_open(httplib.HTTP, req)
+
+ if hasattr(httplib, 'HTTPS'):
+ class HTTPSHandler(AbstractHTTPHandler):
+ def https_open(self, req):
+ return self.do_open(httplib.HTTPS, req)
+
+
+ def build_opener(*handlers):
+ """Create an opener object from a list of handlers and processors.
+
+ The opener will use several default handlers and processors, including
+ support for HTTP and FTP. If there is a ProxyHandler, it must be at the
+ front of the list of handlers. (Yuck. This is fixed in 2.3.)
+
+ If any of the handlers passed as arguments are subclasses of the
+ default handlers, the default handlers will not be used.
+ """
+ opener = OpenerDirector()
+ default_classes = [
+ # handlers
+ urllib2.ProxyHandler,
+ urllib2.UnknownHandler,
+ HTTPHandler, # from this module (derived from new AbstractHTTPHandler)
+ urllib2.HTTPDefaultErrorHandler,
+ HTTPRedirectHandler, # from this module (bugfixed)
+ urllib2.FTPHandler,
+ urllib2.FileHandler,
+ # processors
+ HTTPRequestUpgradeProcessor,
+ #HTTPEquivProcessor,
+ #SeekableProcessor,
+ HTTPCookieProcessor,
+ #HTTPRefererProcessor,
+ HTTPStandardHeadersProcessor,
+ #HTTPRefreshProcessor,
+ HTTPErrorProcessor
+ ]
+ if hasattr(httplib, 'HTTPS'):
+ default_classes.append(HTTPSHandler)
+ skip = []
+ for klass in default_classes:
+ for check in handlers:
+ if type(check) == types.ClassType:
+ if issubclass(check, klass):
+ skip.append(klass)
+ elif type(check) == types.InstanceType:
+ if isinstance(check, klass):
+ skip.append(klass)
+ for klass in skip:
+ default_classes.remove(klass)
+
+ to_add = []
+ for klass in default_classes:
+ to_add.append(klass())
+ for h in handlers:
+ if type(h) == types.ClassType:
+ h = h()
+ to_add.append(h)
+
+ for instance in to_add:
+ opener.add_handler(instance)
+## # yuck
+## if hasattr(instance, "processor_order"):
+## opener.add_processor(instance)
+## else:
+## opener.add_handler(instance)
+
+ return opener
+
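+    # Sketch: instances of non-default classes, like the (commented-out by
+    # default) HTTPResponseDebugProcessor above, are simply appended.
+    def _example_debug_opener():
+        return build_opener(HTTPResponseDebugProcessor())
+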
+
+ _opener = None
+ urlopen_lock = _threading.Lock()
+ def urlopen(url, data=None):
+ global _opener
+ if _opener is None:
+ urlopen_lock.acquire()
+ try:
+ if _opener is None:
+ _opener = build_opener()
+ finally:
+ urlopen_lock.release()
+ return _opener.open(url, data)
+
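+    # Sketch: urlopen() mirrors urllib2.urlopen, but with this module's
+    # processors wired in by default (the URL below is made up).
+    def _example_urlopen():
+        return urlopen("http://www.example.com/").read()
+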
+ def install_opener(opener):
+ global _opener
+ _opener = opener