diff options
Diffstat (limited to 'tools/bug_tool/ClientCookie/_urllib2_support.py')
-rw-r--r-- | tools/bug_tool/ClientCookie/_urllib2_support.py | 713 |
1 files changed, 0 insertions, 713 deletions
diff --git a/tools/bug_tool/ClientCookie/_urllib2_support.py b/tools/bug_tool/ClientCookie/_urllib2_support.py deleted file mode 100644 index d767d08b25..0000000000 --- a/tools/bug_tool/ClientCookie/_urllib2_support.py +++ /dev/null @@ -1,713 +0,0 @@ -"""Integration with Python standard library module urllib2. - -Also includes a redirection bugfix, support for parsing HTML HEAD blocks for -the META HTTP-EQUIV tag contents, and following Refresh header redirects. - -Copyright 2002-2003 John J Lee <jjl@pobox.com> - -This code is free software; you can redistribute it and/or modify it under -the terms of the BSD License (see the file COPYING included with the -distribution). - -""" - -import copy, time - -import ClientCookie -from _ClientCookie import CookieJar, request_host -from _Util import isstringlike -from _Debug import _debug - -try: True -except NameError: - True = 1 - False = 0 - -CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes - -try: - from urllib2 import AbstractHTTPHandler -except ImportError: - pass -else: - import urlparse, urllib2, urllib, httplib, htmllib, formatter, string - from urllib2 import URLError, HTTPError - import types, string, socket - from cStringIO import StringIO - from _Util import seek_wrapper - try: - import threading - _threading = threading; del threading - except ImportError: - import dummy_threading - _threading = dummy_threading; del dummy_threading - - # This fixes a bug in urllib2 as of Python 2.1.3 and 2.2.2 - # (http://www.python.org/sf/549151) - # 2.2.3 is broken here (my fault!), 2.3 is fixed. - class HTTPRedirectHandler(urllib2.BaseHandler): - # maximum number of redirections before assuming we're in a loop - max_redirections = 10 - - # Implementation notes: - - # To avoid the server sending us into an infinite loop, the request - # object needs to track what URLs we have already seen. Do this by - # adding a handler-specific attribute to the Request object. The value - # of the dict is used to count the number of times the same url has - # been visited. This is needed because this isn't necessarily a loop: - # there is more than one way to redirect (Refresh, 302, 303, 307). - - # Another handler-specific Request attribute, original_url, is used to - # remember the URL of the original request so that it is possible to - # decide whether or not RFC 2965 cookies should be turned on during - # redirect. - - # Always unhandled redirection codes: - # 300 Multiple Choices: should not handle this here. - # 304 Not Modified: no need to handle here: only of interest to caches - # that do conditional GETs - # 305 Use Proxy: probably not worth dealing with here - # 306 Unused: what was this for in the previous versions of protocol?? - - def redirect_request(self, newurl, req, fp, code, msg, headers): - """Return a Request or None in response to a redirect. - - This is called by the http_error_30x methods when a redirection - response is received. If a redirection should take place, return a - new Request to allow http_error_30x to perform the redirect; - otherwise, return None to indicate that an HTTPError should be - raised. - - """ - if code in (301, 302, 303) or (code == 307 and not req.has_data()): - # Strictly (according to RFC 2616), 301 or 302 in response to - # a POST MUST NOT cause a redirection without confirmation - # from the user (of urllib2, in this case). In practice, - # essentially all clients do redirect in this case, so we do - # the same. - return Request(newurl, headers=req.headers) - else: - raise HTTPError(req.get_full_url(), code, msg, headers, fp) - - def http_error_302(self, req, fp, code, msg, headers): - if headers.has_key('location'): - newurl = headers['location'] - elif headers.has_key('uri'): - newurl = headers['uri'] - else: - return - newurl = urlparse.urljoin(req.get_full_url(), newurl) - - # XXX Probably want to forget about the state of the current - # request, although that might interact poorly with other - # handlers that also use handler-specific request attributes - new = self.redirect_request(newurl, req, fp, code, msg, headers) - if new is None: - return - - # remember where we started from - if hasattr(req, "original_url"): - new.original_url = req.original_url - else: - new.original_url = req.get_full_url() - - # loop detection - # .error_302_dict[(url, code)] is number of times url - # previously visited as a result of a redirection with this - # code (error_30x_dict would be a better name). - new.origin_req_host = req.origin_req_host - if not hasattr(req, 'error_302_dict'): - new.error_302_dict = req.error_302_dict = {(newurl, code): 1} - else: - ed = new.error_302_dict = req.error_302_dict - nr_visits = ed.get((newurl, code), 0) - # Refreshes generate fake 302s, so we can hit the same URL as - # a result of the same redirection code twice without - # necessarily being in a loop! So, allow two visits to each - # URL as a result of each redirection code. - if len(ed) < self.max_redirections and nr_visits < 2: - ed[(newurl, code)] = nr_visits + 1 - else: - raise HTTPError(req.get_full_url(), code, - self.inf_msg + msg, headers, fp) - - if ClientCookie.REDIRECT_DEBUG: - _debug("redirecting to %s", newurl) - - # Don't close the fp until we are sure that we won't use it - # with HTTPError. - fp.read() - fp.close() - - return self.parent.open(new) - - http_error_301 = http_error_303 = http_error_307 = http_error_302 - - inf_msg = "The HTTP server returned a redirect error that would " \ - "lead to an infinite loop.\n" \ - "The last 30x error message was:\n" - - - class Request(urllib2.Request): - def __init__(self, url, data=None, headers={}): - urllib2.Request.__init__(self, url, data, headers) - self.unredirected_hdrs = {} - - def add_unredirected_header(self, key, val): - # these headers do not persist from one request to the next in a chain - # of requests - self.unredirected_hdrs[string.capitalize(key)] = val - - def has_key(self, header_name): - if (self.headers.has_key(header_name) or - self.unredirected_hdrs.has_key(header_name)): - return True - return False - - def get(self, header_name, failobj=None): - if self.headers.has_key(header_name): - return self.headers[header_name] - if self.unredirected_headers.has_key(header_name): - return self.unredirected_headers[header_name] - return failobj - - - class BaseProcessor: - processor_order = 500 - - def add_parent(self, parent): - self.parent = parent - def close(self): - self.parent = None - def __lt__(self, other): - if not hasattr(other, "processor_order"): - return True - return self.processor_order < other.processor_order - - class HTTPRequestUpgradeProcessor(BaseProcessor): - # upgrade Request to class with support for headers that don't get - # redirected - processor_order = 0 # before anything else - - def http_request(self, request): - if not hasattr(request, "add_unredirected_header"): - request = Request(request._Request__original, request.data, - request.headers) - return request - - https_request = http_request - - class HTTPEquivProcessor(BaseProcessor): - """Append META HTTP-EQUIV headers to regular HTTP headers.""" - def http_response(self, request, response): - if not hasattr(response, "seek"): - response = seek_wrapper(response) - # grab HTTP-EQUIV headers and add them to the true HTTP headers - headers = response.info() - for hdr, val in parse_head(response): - headers[hdr] = val - response.seek(0) - return response - - https_response = http_response - - # XXX ATM this only takes notice of http responses -- probably - # should be independent of protocol scheme (http, ftp, etc.) - class SeekableProcessor(BaseProcessor): - """Make responses seekable.""" - - def http_response(self, request, response): - if not hasattr(response, "seek"): - return seek_wrapper(response) - return response - - https_response = http_response - - # XXX if this gets added to urllib2, unverifiable would end up as an - # attribute on Request. - class HTTPCookieProcessor(BaseProcessor): - """Handle HTTP cookies.""" - def __init__(self, cookies=None): - if cookies is None: - cookies = CookieJar() - self.cookies = cookies - - def _unverifiable(self, request): - if hasattr(request, "error_302_dict") and request.error_302_dict: - redirect = True - else: - redirect = False - if (redirect or - (hasattr(request, "unverifiable") and request.unverifiable)): - unverifiable = True - else: - unverifiable = False - return unverifiable - - def http_request(self, request): - unverifiable = self._unverifiable(request) - if not unverifiable: - # Stuff request-host of this origin transaction into Request - # object, because we need to know it to know whether cookies - # should be in operation during derived requests (redirects, - # specifically -- including refreshes). - request.origin_req_host = request_host(request) - self.cookies.add_cookie_header(request, unverifiable) - return request - - def http_response(self, request, response): - unverifiable = self._unverifiable(request) - self.cookies.extract_cookies(response, request, unverifiable) - return response - - https_request = http_request - https_response = http_response - - class HTTPRefererProcessor(BaseProcessor): - """Add Referer header to requests. - - This only makes sense if you use each RefererProcessor for a single - chain of requests only (so, for example, if you use a single - HTTPRefererProcessor to fetch a series of URLs extracted from a single - page, this will break). - - """ - def __init__(self): - self.referer = None - - def http_request(self, request): - if ((self.referer is not None) and - not request.has_key("Referer")): - request.add_unredirected_header("Referer", self.referer) - return request - - def http_response(self, request, response): - self.referer = response.geturl() - return response - - https_request = http_request - https_response = http_response - - class HTTPStandardHeadersProcessor(BaseProcessor): - def http_request(self, request): - host = request.get_host() - if not host: - raise URLError('no host given') - - if request.has_data(): # POST - data = request.get_data() - if not request.has_key('Content-type'): - request.add_unredirected_header( - 'Content-type', - 'application/x-www-form-urlencoded') - if not request.has_key('Content-length'): - request.add_unredirected_header( - 'Content-length', '%d' % len(data)) - - scheme, sel = urllib.splittype(request.get_selector()) - sel_host, sel_path = urllib.splithost(sel) - if not request.has_key('Host'): - request.add_unredirected_header('Host', sel_host or host) - for name, value in self.parent.addheaders: - name = string.capitalize(name) - if not request.has_key(name): - request.add_unredirected_header(name, value) - - return request - - https_request = http_request - - class HTTPResponseDebugProcessor(BaseProcessor): - processor_order = 900 # before redirections, after everything else - - def http_response(self, request, response): - if not hasattr(response, "seek"): - response = seek_wrapper(response) - _debug(response.read()) - _debug("*****************************************************") - response.seek(0) - return response - - https_response = http_response - - class HTTPRefreshProcessor(BaseProcessor): - """Perform HTTP Refresh redirections. - - Note that if a non-200 HTTP code has occurred (for example, a 30x - redirect), this processor will do nothing. - - By default, only zero-time Refresh headers are redirected. Use the - max_time constructor argument to allow Refresh with longer pauses. - Use the honor_time argument to control whether the requested pause - is honoured (with a time.sleep()) or skipped in favour of immediate - redirection. - - """ - processor_order = 1000 - - def __init__(self, max_time=0, honor_time=True): - self.max_time = max_time - self.honor_time = honor_time - - def http_response(self, request, response): - code, msg, hdrs = response.code, response.msg, response.info() - - if code == 200 and hdrs.has_key("refresh"): - refresh = hdrs["refresh"] - i = string.find(refresh, ";") - if i != -1: - pause, newurl_spec = refresh[:i], refresh[i+1:] - i = string.find(newurl_spec, "=") - if i != -1: - pause = int(pause) - if pause <= self.max_time: - if pause != 0 and self.honor_time: - time.sleep(pause) - newurl = newurl_spec[i+1:] - # fake a 302 response - hdrs["location"] = newurl - response = self.parent.error( - 'http', request, response, 302, msg, hdrs) - - return response - - https_response = http_response - - class HTTPErrorProcessor(BaseProcessor): - """Process non-200 HTTP error responses. - - This just passes the job on to the Handler.<proto>_error_<code> - methods, via the OpenerDirector.error method. - - """ - processor_order = 1000 - - def http_response(self, request, response): - code, msg, hdrs = response.code, response.msg, response.info() - - if code != 200: - response = self.parent.error( - 'http', request, response, code, msg, hdrs) - - return response - - https_response = http_response - - - class OpenerDirector(urllib2.OpenerDirector): - # XXX might be useful to have remove_processor, too (say you want to - # set a new RefererProcessor, but keep the old CookieProcessor -- - # could always just create everything anew, though (using old - # CookieJar object to create CookieProcessor) - def __init__(self): - urllib2.OpenerDirector.__init__(self) - #self.processors = [] - self.process_response = {} - self.process_request = {} - - def add_handler(self, handler): - # XXX - # tidy me - # the same handler could be added twice without detection - added = 0 - for meth in dir(handler.__class__): - if meth[-5:] == '_open': - protocol = meth[:-5] - if self.handle_open.has_key(protocol): - self.handle_open[protocol].append(handler) - self.handle_open[protocol].sort() - else: - self.handle_open[protocol] = [handler] - added = 1 - continue - i = string.find(meth, '_') - j = string.find(meth[i+1:], '_') + i + 1 - if j != -1 and meth[i+1:j] == 'error': - proto = meth[:i] - kind = meth[j+1:] - try: - kind = int(kind) - except ValueError: - pass - dict = self.handle_error.get(proto, {}) - if dict.has_key(kind): - dict[kind].append(handler) - dict[kind].sort() - else: - dict[kind] = [handler] - self.handle_error[proto] = dict - added = 1 - continue - if meth[-9:] == "_response": - protocol = meth[:-9] - if self.process_response.has_key(protocol): - self.process_response[protocol].append(handler) - self.process_response[protocol].sort() - else: - self.process_response[protocol] = [handler] - added = True - continue - elif meth[-8:] == "_request": - protocol = meth[:-8] - if self.process_request.has_key(protocol): - self.process_request[protocol].append(handler) - self.process_request[protocol].sort() - else: - self.process_request[protocol] = [handler] - added = True - continue - if added: - self.handlers.append(handler) - self.handlers.sort() - handler.add_parent(self) - -## def add_processor(self, processor): -## added = False -## for meth in dir(processor): -## if meth[-9:] == "_response": -## protocol = meth[:-9] -## if self.process_response.has_key(protocol): -## self.process_response[protocol].append(processor) -## self.process_response[protocol].sort() -## else: -## self.process_response[protocol] = [processor] -## added = True -## continue -## elif meth[-8:] == "_request": -## protocol = meth[:-8] -## if self.process_request.has_key(protocol): -## self.process_request[protocol].append(processor) -## self.process_request[protocol].sort() -## else: -## self.process_request[protocol] = [processor] -## added = True -## continue -## if added: -## self.processors.append(processor) -## # XXX base class sorts .handlers, but I have no idea why -## #self.processors.sort() -## processor.add_parent(self) - - def _request(self, url_or_req, data): - if isstringlike(url_or_req): - req = Request(url_or_req, data) - else: - # already a urllib2.Request instance - req = url_or_req - if data is not None: - req.add_data(data) - return req - - def open(self, fullurl, data=None): - req = self._request(fullurl, data) - type = req.get_type() - - # pre-process request - # XXX should we allow a Processor to change the type (URL - # scheme) of the request? - meth_name = type+"_request" - for processor in self.process_request.get(type, []): - meth = getattr(processor, meth_name) - req = meth(req) - - response = urllib2.OpenerDirector.open(self, req, data) - - # post-process response - meth_name = type+"_response" - for processor in self.process_response.get(type, []): - meth = getattr(processor, meth_name) - response = meth(req, response) - - return response - -## def close(self): -## urllib2.OpenerDirector.close(self) -## for processor in self.processors: -## processor.close() -## self.processors = [] - - - # Note the absence of redirect and header-adding code here - # (AbstractHTTPHandler), and the lack of other clutter that would be - # here without Processors. - class AbstractHTTPHandler(urllib2.BaseHandler): - def do_open(self, http_class, req): - host = req.get_host() - if not host: - raise URLError('no host given') - - h = http_class(host) # will parse host:port - if ClientCookie.HTTP_DEBUG: - h.set_debuglevel(1) - - if req.has_data(): - h.putrequest('POST', req.get_selector()) - else: - h.putrequest('GET', req.get_selector()) - - for k, v in req.headers.items(): - h.putheader(k, v) - for k, v in req.unredirected_hdrs.items(): - h.putheader(k, v) - - # httplib will attempt to connect() here. be prepared - # to convert a socket error to a URLError. - try: - h.endheaders() - except socket.error, err: - raise URLError(err) - if req.has_data(): - h.send(req.get_data()) - - code, msg, hdrs = h.getreply() - fp = h.getfile() - - response = urllib.addinfourl(fp, hdrs, req.get_full_url()) - response.code = code - response.msg = msg - - return response - - - # XXX would self.reset() work, instead of raising this exception? - class EndOfHeadError(Exception): pass - class HeadParser(htmllib.HTMLParser): - # only these elements are allowed in or before HEAD of document - head_elems = ("html", "head", - "title", "base", - "script", "style", "meta", "link", "object") - def __init__(self): - htmllib.HTMLParser.__init__(self, formatter.NullFormatter()) - self.http_equiv = [] - - def start_meta(self, attrs): - http_equiv = content = None - for key, value in attrs: - if key == "http-equiv": - http_equiv = value - elif key == "content": - content = value - if http_equiv is not None: - self.http_equiv.append((http_equiv, content)) - - def handle_starttag(self, tag, method, attrs): - if tag in self.head_elems: - method(attrs) - else: - raise EndOfHeadError() - - def handle_endtag(self, tag, method): - if tag in self.head_elems: - method() - else: - raise EndOfHeadError() - - def end_head(self): - raise EndOfHeadError() - - def parse_head(file): - """Return a list of key, value pairs.""" - hp = HeadParser() - while 1: - data = file.read(CHUNK) - try: - hp.feed(data) - except EndOfHeadError: - break - if len(data) != CHUNK: - # this should only happen if there is no HTML body, or if - # CHUNK is big - break - return hp.http_equiv - - - class HTTPHandler(AbstractHTTPHandler): - def http_open(self, req): - return self.do_open(httplib.HTTP, req) - - if hasattr(httplib, 'HTTPS'): - class HTTPSHandler(AbstractHTTPHandler): - def https_open(self, req): - return self.do_open(httplib.HTTPS, req) - - - def build_opener(*handlers): - """Create an opener object from a list of handlers and processors. - - The opener will use several default handlers and processors, including - support for HTTP and FTP. If there is a ProxyHandler, it must be at the - front of the list of handlers. (Yuck. This is fixed in 2.3.) - - If any of the handlers passed as arguments are subclasses of the - default handlers, the default handlers will not be used. - """ - opener = OpenerDirector() - default_classes = [ - # handlers - urllib2.ProxyHandler, - urllib2.UnknownHandler, - HTTPHandler, # from this module (derived from new AbstractHTTPHandler) - urllib2.HTTPDefaultErrorHandler, - HTTPRedirectHandler, # from this module (bugfixed) - urllib2.FTPHandler, - urllib2.FileHandler, - # processors - HTTPRequestUpgradeProcessor, - #HTTPEquivProcessor, - #SeekableProcessor, - HTTPCookieProcessor, - #HTTPRefererProcessor, - HTTPStandardHeadersProcessor, - #HTTPRefreshProcessor, - HTTPErrorProcessor - ] - if hasattr(httplib, 'HTTPS'): - default_classes.append(HTTPSHandler) - skip = [] - for klass in default_classes: - for check in handlers: - if type(check) == types.ClassType: - if issubclass(check, klass): - skip.append(klass) - elif type(check) == types.InstanceType: - if isinstance(check, klass): - skip.append(klass) - for klass in skip: - default_classes.remove(klass) - - to_add = [] - for klass in default_classes: - to_add.append(klass()) - for h in handlers: - if type(h) == types.ClassType: - h = h() - to_add.append(h) - - for instance in to_add: - opener.add_handler(instance) -## # yuck -## if hasattr(instance, "processor_order"): -## opener.add_processor(instance) -## else: -## opener.add_handler(instance) - - return opener - - - _opener = None - urlopen_lock = _threading.Lock() - def urlopen(url, data=None): - global _opener - if _opener is None: - urlopen_lock.acquire() - try: - if _opener is None: - _opener = build_opener() - finally: - urlopen_lock.release() - return _opener.open(url, data) - - def install_opener(opener): - global _opener - _opener = opener |