diff options
author | Taybin Rutkin <taybin@taybin.com> | 2005-05-13 20:47:18 +0000 |
---|---|---|
committer | Taybin Rutkin <taybin@taybin.com> | 2005-05-13 20:47:18 +0000 |
commit | d09f6b3016bacbc2871a8946cbb24ad705076509 (patch) | |
tree | f27312839c2a772cb2ce068a4f28b2449ad869df /tools/bug_tool/ClientCookie |
Initial revision
git-svn-id: svn://localhost/trunk/ardour2@4 d708f5d6-7413-0410-9779-e7cbd77b26cf
Diffstat (limited to 'tools/bug_tool/ClientCookie')
-rw-r--r-- | tools/bug_tool/ClientCookie/.cvsignore | 3 | ||||
-rw-r--r-- | tools/bug_tool/ClientCookie/_ClientCookie.py | 1833 | ||||
-rw-r--r-- | tools/bug_tool/ClientCookie/_Debug.py | 9 | ||||
-rw-r--r-- | tools/bug_tool/ClientCookie/_HeadersUtil.py | 224 | ||||
-rw-r--r-- | tools/bug_tool/ClientCookie/_MSIECookieJar.py | 377 | ||||
-rw-r--r-- | tools/bug_tool/ClientCookie/_MozillaCookieJar.py | 171 | ||||
-rw-r--r-- | tools/bug_tool/ClientCookie/_Util.py | 459 | ||||
-rw-r--r-- | tools/bug_tool/ClientCookie/__init__.py | 49 | ||||
-rw-r--r-- | tools/bug_tool/ClientCookie/_urllib2_support.py | 713 |
9 files changed, 3838 insertions, 0 deletions
diff --git a/tools/bug_tool/ClientCookie/.cvsignore b/tools/bug_tool/ClientCookie/.cvsignore new file mode 100644 index 0000000000..c53e20660e --- /dev/null +++ b/tools/bug_tool/ClientCookie/.cvsignore @@ -0,0 +1,3 @@ +*.pyc +Makefile.in +Makefile diff --git a/tools/bug_tool/ClientCookie/_ClientCookie.py b/tools/bug_tool/ClientCookie/_ClientCookie.py new file mode 100644 index 0000000000..307fa22afb --- /dev/null +++ b/tools/bug_tool/ClientCookie/_ClientCookie.py @@ -0,0 +1,1833 @@ +"""HTTP cookie handling for web clients, plus some other stuff. + +This module originally developed from my port of Gisle Aas' Perl module +HTTP::Cookies, from the libwww-perl library. + +Docstrings, comments and debug strings in this code refer to the +attributes of the HTTP cookie system as cookie-attributes, to distinguish +them clearly from Python attributes. + +Comments to John J Lee <jjl@pobox.com>. + + +Copyright 2002-2003 John J Lee <jjl@pobox.com> +Copyright 1997-1999 Gisle Aas (original libwww-perl code) +Copyright 2002-2003 Johnny Lee (original MSIE Perl code) + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD License (see the file COPYING included with the +distribution). 
+ +""" + +VERSION = "0.4.9" + + +# Public health warning: anyone who thought 'cookies are simple, aren't they?', +# run away now :-( + +import sys, re, urlparse, string, copy, time, struct +try: + import threading + _threading = threading; del threading +except ImportError: + import dummy_threading + _threading = dummy_threading; del dummy_threading +import httplib # only for the default HTTP port + +MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " + "instance initialised with one)") +DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT) + +try: True +except NameError: + True = 1 + False = 0 + +try: StopIteration +except NameError: + class StopIteration(Exception): pass + +import ClientCookie +from _HeadersUtil import split_header_words, join_header_words, \ + parse_ns_headers +from _Util import startswith, endswith, iso2time, time2isoz +from _Debug import debug + +try: bool +except NameError: + def bool(expr): + if expr: return True + else: return False + +try: issubclass(Exception, (Exception,)) +except TypeError: + real_issubclass = issubclass + from _Util import compat_issubclass + issubclass = compat_issubclass + del compat_issubclass + +SPACE_DICT = {} +for c in string.whitespace: + SPACE_DICT[c] = None +del c +def isspace(string): + for c in string: + if not SPACE_DICT.has_key(c): return False + return True + +def getheaders(msg, name): + """Get all values for a header. + + This returns a list of values for headers given more than once; each + value in the result list is stripped in the same way as the result of + getheader(). If the header is not given, return an empty list. 
+ """ + result = [] + current = '' + have_header = 0 + for s in msg.getallmatchingheaders(name): + if isspace(s[0]): + if current: + current = "%s\n %s" % (current, string.strip(s)) + else: + current = string.strip(s) + else: + if have_header: + result.append(current) + current = string.strip(s[string.find(s, ":") + 1:]) + have_header = 1 + if have_header: + result.append(current) + return result + +def reraise_unmasked_exceptions(unmasked=()): + # There are a few catch-all except: statements in this module, for + # catching input that's bad in unexpected ways. + # This function re-raises some exceptions we don't want to trap. + if ClientCookie.CLIENTCOOKIE_DEBUG: + raise + unmasked = unmasked + (KeyboardInterrupt, SystemExit) + etype = sys.exc_info()[0] + if issubclass(etype, unmasked): + raise + + +IPV4_RE = re.compile(r"\.\d+$") +def is_HDN(text): + """Return True if text is a host domain name.""" + # XXX + # This may well be wrong. Which RFC is HDN defined in, if any (for + # the purposes of RFC 2965)? + # For the current implementation, what about IPv6? Remember to look + # at other uses of IPV4_RE also, if change this. + if IPV4_RE.search(text): + return False + if text == "": + return False + if text[0] == "." or text[-1] == ".": + return False + return True + +def domain_match(A, B): + """Return True if domain A domain-matches domain B, according to RFC 2965. + + A and B may be host domain names or IP addresses. + + RFC 2965, section 1: + + Host names can be specified either as an IP address or a HDN string. + Sometimes we compare one host name with another. (Such comparisons SHALL + be case-insensitive.) Host A's name domain-matches host B's if + + * their host name strings string-compare equal; or + + * A is a HDN string and has the form NB, where N is a non-empty + name string, B has the form .B', and B' is a HDN string. (So, + x.y.com domain-matches .Y.com but not Y.com.) 
+ + Note that domain-match is not a commutative operation: a.b.c.com + domain-matches .c.com, but not the reverse. + + """ + # Note that, if A or B are IP addresses, the only relevant part of the + # definition of the domain-match algorithm is the direct string-compare. + A = string.lower(A) + B = string.lower(B) + if A == B: + return True + if not is_HDN(A): + return False + i = string.rfind(A, B) + if i == -1 or i == 0: + # A does not have form NB, or N is the empty string + return False + if not startswith(B, "."): + return False + if not is_HDN(B[1:]): + return False + return True + +def liberal_is_HDN(text): + """Return True if text is a sort-of-like a host domain name. + + For accepting/blocking domains. + + """ + if IPV4_RE.search(text): + return False + return True + +def user_domain_match(A, B): + """For blocking/accepting domains. + + A and B may be host domain names or IP addresses. + + """ + A = string.lower(A) + B = string.lower(B) + if not (liberal_is_HDN(A) and liberal_is_HDN(B)): + if A == B: + # equal IP addresses + return True + return False + initial_dot = startswith(B, ".") + if initial_dot and endswith(A, B): + return True + if not initial_dot and A == B: + return True + return False + +cut_port_re = re.compile(r":\d+$") +def request_host(request): + """Return request-host, as defined by RFC 2965. + + Variation from RFC: returned value is lowercased, for convenient + comparison. + + """ + url = request.get_full_url() + host = urlparse.urlparse(url)[1] + if host == "": + host = request.headers.get("Host", "") + + # remove port, if present + host = cut_port_re.sub("", host, 1) + return string.lower(host) + +def eff_request_host(request): + """Return a tuple (request-host, effective request-host name). + + As defined by RFC 2965, except both are lowercased. 
+ + """ + erhn = req_host = request_host(request) + if string.find(req_host, ".") == -1 and not IPV4_RE.search(req_host): + erhn = req_host + ".local" + return req_host, erhn + +def request_path(request): + """request-URI, as defined by RFC 2965.""" + url = request.get_full_url() + #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(url) + req_path = normalize_path(string.join(urlparse.urlparse(url)[2:], "")) + if not startswith(req_path, "/"): + # fix bad RFC 2396 absoluteURI + req_path = "/"+req_path + return req_path + +def request_port(request): + # ATM (Python 2.3) request.port is always None, and unused by urllib2 + port = request.port + host = request.get_host() + if port is None: + i = string.find(host, ':') + if i >= 0: + port = host[i+1:] + try: + int(port) + except ValueError: + debug("nonnumeric port: '%s'" % port) + return None + else: + port = DEFAULT_HTTP_PORT + return port + +def unescape_path_fn(match): + x = string.upper(match.group(1)) + if x == "2F" or x == "25": + return "%%%s" % (x,) + else: + # string.atoi deprecated in 2.0, but 1.5.2 int function won't do + # radix conversion + return struct.pack("B", string.atoi(x, 16)) +def normalize_path_fn(match): + return "%%%02X" % ord(match.group(1)) + +unescape_re = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])") +normalize_re = re.compile(r"([\0-\x20\x7f-\xff])") +def normalize_path(path): + """Normalise URI path so that plain string compare can be used. + + >>> normalize_path("%19\xd3%Fb%2F%25%26") + '%19%D3%FB%2F%25&' + >>> + + In normalised form, all non-printable characters are %-escaped, and all + printable characters are given literally (not escaped). All remaining + %-escaped characters are capitalised. %25 and %2F are special-cased, + because they represent the printable characters"%" and "/", which are used + as escape and URI path separator characters respectively. 
+ + """ + path = unescape_re.sub(unescape_path_fn, path) + path = normalize_re.sub(normalize_path_fn, path) + return path + +def reach(h): + """Return reach of host h, as defined by RFC 2965, section 1. + + The reach R of a host name H is defined as follows: + + * If + + - H is the host domain name of a host; and, + + - H has the form A.B; and + + - A has no embedded (that is, interior) dots; and + + - B has at least one embedded dot, or B is the string "local". + then the reach of H is .B. + + * Otherwise, the reach of H is H. + + >>> reach("www.acme.com") + '.acme.com' + >>> reach("acme.com") + 'acme.com' + >>> reach("acme.local") + '.local' + + """ + i = string.find(h, ".") + if i >= 0: + #a = h[:i] # this line is only here to show what a is + b = h[i+1:] + i = string.find(b, ".") + if is_HDN(h) and (i >= 0 or b == "local"): + return "."+b + return h + +def is_third_party(request): + """ + + RFC 2965, section 3.3.6: + + An unverifiable transaction is to a third-party host if its request- + host U does not domain-match the reach R of the request-host O in the + origin transaction. + + """ + req_host = string.lower(request_host(request)) + # the origin request's request-host was stuffed into request by + # _urllib2_support.AbstractHTTPHandler + if not domain_match(req_host, reach(request.origin_req_host)): + return True + else: + return False + + +class Cookie: + """HTTP Cookie. + + This class represents both Netscape and RFC 2965 cookies. + + This is deliberately a very simple class. It just holds attributes. It's + possible to construct Cookie instances that don't comply with the cookie + standards. CookieJar.make_cookies is the factory function for Cookie + objects -- it deals with cookie parsing, supplying defaults, and + normalising to the representation used in this class. CookiePolicy is + responsible for checking them to see whether they should be accepted from + and returned to the server. 
+ + version: integer; + name: string (may be None); + value: string; + port: string; None indicates no attribute was supplied (eg. "Port", rather + than eg. "Port=80"); otherwise, a port string (eg. "80") or a port list + string (eg. "80,8080") + port_specified: boolean; true if a value was supplied with the Port + cookie-attribute + domain: string; + domain_specified: boolean; true if Domain was explicitly set + domain_initial_dot: boolean; true if Domain as set in HTTP header by server + started with a dot (yes, this really is necessary!) + path: string; + path_specified: boolean; true if Path was explicitly set + secure: boolean; true if should only be returned over secure connection + expires: integer; seconds since epoch (RFC 2965 cookies should calculate + this value from the Max-Age attribute) + discard: boolean, true if this is a session cookie; (if no expires value, + this should be true) + comment: string; + comment_url: string; + rest: mapping of other attributes + + Note that the port may be present in the headers, but unspecified ("Port" + rather than"Port=80", for example); if this is the case, port is None. + + """ + + def __init__(self, version, name, value, + port, port_specified, + domain, domain_specified, domain_initial_dot, + path, path_specified, + secure, + expires, + discard, + comment, + comment_url, + rest): + + if version is not None: version = int(version) + if expires is not None: expires = int(expires) + if port is None and port_specified is True: + raise ValueError("if port is None, port_specified must be false") + + self.version = version + self.name = name + self.value = value + self.port = port + self.port_specified = port_specified + # normalise case, as per RFC 2965 section 3.3.3 + self.domain = string.lower(domain) + self.domain_specified = domain_specified + # Sigh. We need to know whether the domain given in the + # cookie-attribute had an initial dot, in order to follow RFC 2965 + # (as clarified in draft errata). 
Needed for the returned $Domain + # value. + self.domain_initial_dot = domain_initial_dot + self.path = path + self.path_specified = path_specified + self.secure = secure + self.expires = expires + self.discard = discard + self.comment = comment + self.comment_url = comment_url + + self.rest = copy.copy(rest) + + def is_expired(self, now=None): + if now is None: now = time.time() + if (self.expires is not None) and (self.expires <= now): + return True + return False + + def __str__(self): + if self.port is None: p = "" + else: p = ":"+self.port + limit = self.domain + p + self.path + if self.name is not None: + namevalue = "%s=%s" % (self.name, self.value) + else: + namevalue = self.value + return "<Cookie %s for %s>" % (namevalue, limit) + + def __repr__(self): + args = [] + for name in ["version", "name", "value", + "port", "port_specified", + "domain", "domain_specified", "domain_initial_dot", + "path", "path_specified", + "secure", "expires", "discard", "comment", "comment_url"]: + attr = getattr(self, name) + args.append("%s=%s" % (name, attr)) + args.append(repr(self.rest)) + return "Cookie(%s)" % string.join(args, ", ") + + +class CookiePolicy: + """Defines which cookies get accepted from and returned to server. + + The subclass DefaultCookiePolicy defines the standard rules for Netscape + and RFC 2965 cookies -- override that if you want a customised policy. + + As well as implementing set_ok and return_ok, implementations of this + interface must also supply the following attributes, indicating which + protocols should be used, and how. These can be read and set at any time, + though whether that makes complete sense from the protocol point of view is + doubtful. 
+ + Public attributes: + + netscape: implement netscape protocol + rfc2965: implement RFC 2965 protocol + hide_cookie2: don't add Cookie2 header to requests (the presence of + this header indicates to the server that we understand RFC 2965 + cookies) + + """ + def set_ok(self, cookie, request, unverifiable): + """Return true if (and only if) cookie should be accepted from server. + + Currently, pre-expired cookies never get this far -- the CookieJar + class deletes such cookies itself. + + cookie: ClientCookie.Cookie object + request: object implementing the interface defined by + CookieJar.extract_cookies.__doc__ + unverifiable: flag indicating whether the transaction is unverifiable, + as defined by RFC 2965 + + """ + raise NotImplementedError() + + def return_ok(self, cookie, request, unverifiable): + """Return true if (and only if) cookie should be returned to server. + + cookie: ClientCookie.Cookie object + request: object implementing the interface defined by + CookieJar.add_cookie_header.__doc__ + unverifiable: flag indicating whether the transaction is unverifiable, + as defined by RFC 2965 + + """ + raise NotImplementedError() + + def domain_return_ok(self, domain, request, unverifiable): + """Return false if cookies should not be returned, given cookie domain. + + This is here as an optimization, to remove the need for checking every + cookie with a particular domain (which may involve reading many files). + The default implementations of domain_return_ok and path_return_ok + (return True) leave all the work to return_ok. + + If domain_return_ok returns true for the cookie domain, path_return_ok + is called for the cookie path. Otherwise, path_return_ok and return_ok + are never called for that cookie domain. If path_return_ok returns + true, return_ok is called with the Cookie object itself for a full + check. Otherwise, return_ok is never called for that cookie path. 
+ + Note that domain_return_ok is called for every *cookie* domain, not + just for the *request* domain. For example, the function might be + called with both ".acme.com" and "www.acme.com" if the request domain is + "www.acme.com". The same goes for path_return_ok. + + For argument documentation, see the docstring for return_ok. + + """ + return True + + def path_return_ok(self, path, request, unverifiable): + """Return false if cookies should not be returned, given cookie path. + + See the docstring for domain_return_ok. + + """ + return True + + +class DefaultCookiePolicy(CookiePolicy): + """Implements the standard rules for accepting and returning cookies. + + Both RFC 2965 and Netscape cookies are covered. + + The easiest way to provide your own policy is to override this class and + call its methods in your overriden implementations before adding your own + additional checks. + + import ClientCookie + class MyCookiePolicy(ClientCookie.DefaultCookiePolicy): + def set_ok(self, cookie, request, unverifiable): + if not ClientCookie.DefaultCookiePolicy.set_ok( + self, cookie, request, unverifiable): + return False + if i_dont_want_to_store_this_cookie(): + return False + return True + + In addition to the features required to implement the CookiePolicy + interface, this class allows you to block and allow domains from setting + and receiving cookies. There are also some strictness switches that allow + you to tighten up the rather loose Netscape protocol rules a little bit (at + the cost of blocking some benign cookies). + + A domain blacklist and whitelist is provided (both off by default). Only + domains not in the blacklist and present in the whitelist (if the whitelist + is active) participate in cookie setting and returning. Use the + blocked_domains constructor argument, and blocked_domains and + set_blocked_domains methods (and the corresponding argument and methods for + allowed_domains). 
If you set a whitelist, you can turn it off again by + setting it to None. + + Domains in block or allow lists that do not start with a dot must + string-compare equal. For example, "acme.com" matches a blacklist entry of + "acme.com", but "www.acme.com" does not. Domains that do start with a dot + are matched by more specific domains too. For example, both "www.acme.com" + and "www.munitions.acme.com" match ".acme.com" (but "acme.com" itself does + not). IP addresses are an exception, and must match exactly. For example, + if blocked_domains contains "192.168.1.2" and ".168.1.2" 192.168.1.2 is + blocked, but 193.168.1.2 is not. + + Additional Public Attributes: + + General strictness switches + + strict_domain: don't allow sites to set two-component domains with + country-code top-level domains like .co.uk, .gov.uk, .co.nz. etc. + This is far from perfect and isn't guaranteed to work! + + RFC 2965 protocol strictness switches + + strict_rfc2965_unverifiable: follow RFC 2965 rules on unverifiable + transactions (usually, an unverifiable transaction is one resulting from + a redirect or an image hosted on another site); if this is false, cookies + are NEVER blocked on the basis of verifiability + + Netscape protocol strictness switches + + strict_ns_unverifiable: apply RFC 2965 rules on unverifiable transactions + even to Netscape cookies + strict_ns_domain: flags indicating how strict to be with domain-matching + rules for Netscape cookies: + DomainStrictNoDots: when setting cookies, host prefix must not contain a + dot (eg. www.foo.bar.com can't set a cookie for .bar.com, because + www.foo contains a dot) + DomainStrictNonDomain: cookies that did not explicitly specify a Domain + cookie-attribute can only be returned to a domain that string-compares + equal to the domain that set the cookie (eg. 
rockets.acme.com won't + be returned cookies from acme.com that had no Domain cookie-attribute) + DomainRFC2965Match: when setting cookies, require a full RFC 2965 + domain-match + DomainLiberal and DomainStrict are the most useful combinations of the + above flags, for convenience + strict_ns_set_initial_dollar: ignore cookies in Set-Cookie: headers that + have names starting with '$' + strict_ns_set_path: don't allow setting cookies whose path doesn't + path-match request URI + + """ + + DomainStrictNoDots = 1 + DomainStrictNonDomain = 2 + DomainRFC2965Match = 4 + + DomainLiberal = 0 + DomainStrict = DomainStrictNoDots|DomainStrictNonDomain + + def __init__(self, + blocked_domains=None, allowed_domains=None, + netscape=True, rfc2965=True, + hide_cookie2=False, + strict_domain=False, + strict_rfc2965_unverifiable=True, + strict_ns_unverifiable=False, + strict_ns_domain=DomainLiberal, + strict_ns_set_initial_dollar=False, + strict_ns_set_path=False): + """ + blocked_domains: sequence of domain names that we never accept cookies + from, nor return cookies to + allowed_domains: if not None, this is a sequence of the only domains + for which we accept and return cookies + + For other arguments, see CookiePolicy.__doc__ and + DefaultCookiePolicy.__doc__.. 
+ + """ + self.netscape = netscape + self.rfc2965 = rfc2965 + self.hide_cookie2 = hide_cookie2 + self.strict_domain = strict_domain + self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable + self.strict_ns_unverifiable = strict_ns_unverifiable + self.strict_ns_domain = strict_ns_domain + self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar + self.strict_ns_set_path = strict_ns_set_path + + if blocked_domains is not None: + self._blocked_domains = tuple(blocked_domains) + else: + self._blocked_domains = () + + if allowed_domains is not None: + allowed_domains = tuple(allowed_domains) + self._allowed_domains = allowed_domains + + def blocked_domains(self): + """Return the sequence of blocked domains (as a tuple).""" + return self._blocked_domains + def set_blocked_domains(self, blocked_domains): + """Set the sequence of blocked domains.""" + self._blocked_domains = tuple(blocked_domains) + + def is_blocked(self, domain): + for blocked_domain in self._blocked_domains: + if user_domain_match(domain, blocked_domain): + return True + return False + + def allowed_domains(self): + """Return None, or the sequence of allowed domains (as a tuple).""" + return self._allowed_domains + def set_allowed_domains(self, allowed_domains): + """Set the sequence of allowed domains, or None.""" + if allowed_domains is not None: + allowed_domains = tuple(allowed_domains) + self._allowed_domains = allowed_domains + + def is_not_allowed(self, domain): + if self._allowed_domains is None: + return False + for allowed_domain in self._allowed_domains: + if user_domain_match(domain, allowed_domain): + return False + return True + + def set_ok(self, cookie, request, unverifiable): + """ + If you override set_ok, be sure to call this method. If it returns + false, so should your subclass (assuming your subclass wants to be more + strict about which cookies to accept). 
+ + """ + debug(" - checking cookie %s=%s" % (cookie.name, cookie.value)) + + assert cookie.value is not None + + for n in "version", "verifiability", "name", "path", "domain", "port": + fn_name = "set_ok_"+n + fn = getattr(self, fn_name) + if not fn(cookie, request, unverifiable): + return False + return True + + def set_ok_version(self, cookie, request, unverifiable): + if cookie.version is None: + # Version is always set to 0 by parse_ns_headers if it's a Netscape + # cookie, so this must be an invalid RFC 2965 cookie. + debug(" Set-Cookie2 without version attribute (%s=%s)" % + (cookie.name, cookie.value)) + return False + if cookie.version > 0 and not self.rfc2965: + debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + debug(" Netscape cookies are switched off") + return False + return True + + def set_ok_verifiability(self, cookie, request, unverifiable): + if unverifiable and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + debug(" third-party RFC 2965 cookie during unverifiable " + "transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + debug(" third-party Netscape cookie during unverifiable " + "transaction") + return False + return True + + def set_ok_name(self, cookie, request, unverifiable): + # Try and stop servers setting V0 cookies designed to hack other + # servers that know both V0 and V1 protocols. 
+ if (cookie.version == 0 and self.strict_ns_set_initial_dollar and + (cookie.name is not None) and startswith(cookie.name, "$")): + debug(" illegal name (starts with '$'): '%s'" % cookie.name) + return False + return True + + def set_ok_path(self, cookie, request, unverifiable): + if cookie.path_specified: + req_path = request_path(request) + if ((cookie.version > 0 or + (cookie.version == 0 and self.strict_ns_set_path)) and + not startswith(req_path, cookie.path)): + debug(" path attribute %s is not a prefix of request " + "path %s" % (cookie.path, req_path)) + return False + return True + + def set_ok_domain(self, cookie, request, unverifiable): + if self.is_blocked(cookie.domain): + debug(" domain %s is in user block-list" % cookie.domain) + return False + if self.is_not_allowed(cookie.domain): + debug(" domain %s is not in user allow-list" % cookie.domain) + return False + if cookie.domain_specified: + req_host, erhn = eff_request_host(request) + domain = cookie.domain + if self.strict_domain and (string.count(domain, ".") >= 2): + i = string.rfind(domain, ".") + j = string.rfind(domain, ".", 0, i) + if j == 0: # domain like .foo.bar + tld = domain[i+1:] + sld = domain[j+1:i] + if (string.lower(sld) in [ + "co", "ac", + "com", "edu", "org", "net", "gov", "mil", "int"] and + len(tld) == 2): + # domain like .co.uk + debug(" country-code second level domain %s" % + domain) + return False + if startswith(domain, "."): + undotted_domain = domain[1:] + else: + undotted_domain = domain + embedded_dots = (string.find(undotted_domain, ".") >= 0) + if not embedded_dots and domain != ".local": + debug(" non-local domain %s contains no embedded dot" % + domain) + return False + if cookie.version == 0: + if (not endswith(erhn, domain) and + (not startswith(erhn, ".") and + not endswith("."+erhn, domain))): + debug(" effective request-host %s (even with added " + "initial dot) does not end end with %s" % + (erhn, domain)) + return False + if (cookie.version > 0 or + 
(self.strict_ns_domain & self.DomainRFC2965Match)): + if not domain_match(erhn, domain): + debug(" effective request-host %s does not domain-match " + "%s" % (erhn, domain)) + return False + if (cookie.version > 0 or + (self.strict_ns_domain & self.DomainStrictNoDots)): + host_prefix = req_host[:-len(domain)] + if (string.find(host_prefix, ".") >= 0 and + not IPV4_RE.search(req_host)): + debug(" host prefix %s for domain %s contains a dot" % + (host_prefix, domain)) + return False + return True + + def set_ok_port(self, cookie, request, unverifiable): + if cookie.port_specified: + req_port = request_port(request) + if req_port is None: + req_port = "80" + else: + req_port = str(req_port) + for p in string.split(cookie.port, ","): + try: + int(p) + except ValueError: + debug(" bad port %s (not numeric)" % p) + return False + if p == req_port: + break + else: + debug(" request port (%s) not found in %s" % + (req_port, cookie.port)) + return False + return True + + def return_ok(self, cookie, request, unverifiable): + """ + If you override return_ok, be sure to call this method. If it returns + false, so should your subclass. + + """ + # Path has already been checked by path_return_ok, and domain blocking + # done by domain_return_ok. 
+ debug(" - checking cookie %s=%s" % (cookie.name, cookie.value)) + + for n in "version", "verifiability", "secure", "expires", "port", "domain": + fn_name = "return_ok_"+n + fn = getattr(self, fn_name) + if not fn(cookie, request, unverifiable): + return False + return True + + def return_ok_version(self, cookie, request, unverifiable): + if cookie.version > 0 and not self.rfc2965: + debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + debug(" Netscape cookies are switched off") + return False + return True + + def return_ok_verifiability(self, cookie, request, unverifiable): + if unverifiable and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + debug(" third-party RFC 2965 cookie during unverifiable " + "transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + debug(" third-party Netscape cookie during unverifiable " + "transaction") + return False + return True + + def return_ok_secure(self, cookie, request, unverifiable): + if cookie.secure and request.get_type() != "https": + debug(" secure cookie with non-secure request") + return False + return True + + def return_ok_expires(self, cookie, request, unverifiable): + if cookie.is_expired(self._now): + debug(" cookie expired") + return False + return True + + def return_ok_port(self, cookie, request, unverifiable): + if cookie.port: + req_port = request_port(request) + if req_port is None: + req_port = "80" + for p in string.split(cookie.port, ","): + if p == req_port: + break + else: + debug(" request port %s does not match cookie port %s" % + (req_port, cookie.port)) + return False + return True + + def return_ok_domain(self, cookie, request, unverifiable): + req_host, erhn = eff_request_host(request) + domain = cookie.domain + + # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't + if (cookie.version == 0 and + (self.strict_ns_domain & 
self.DomainStrictNonDomain) and + not cookie.domain_specified and domain != erhn): + debug(" cookie with unspecified domain does not string-compare " + "equal to request domain") + return False + + if cookie.version > 0 and not domain_match(erhn, domain): + debug(" effective request-host name %s does not domain-match " + "RFC 2965 cookie domain %s" % (erhn, domain)) + return False + if cookie.version == 0 and not endswith("."+req_host, domain): + debug(" request-host %s does not match Netscape cookie domain " + "%s" % (req_host, domain)) + return False + return True + + def domain_return_ok(self, domain, request, unverifiable): + if self.is_blocked(domain): + debug(" domain %s is in user block-list" % domain) + return False + if self.is_not_allowed(domain): + debug(" domain %s is not in user allow-list" % domain) + return False + return True + + def path_return_ok(self, path, request, unverifiable): + debug("- checking cookie path=%s" % path) + req_path = request_path(request) + if not startswith(req_path, path): + debug(" %s does not path-match %s" % (req_path, path)) + return False + return True + + +def lwp_cookie_str(cookie): + """Return string representation of Cookie in an the LWP cookie file format. + + Actually, the format is slightly extended from that used by LWP's + (libwww-perl's) HTTP::Cookies, to avoid losing some RFC 2965 + information not recorded by LWP. + + Used by the CookieJar base class for saving cookies to a file. 
+ + """ + h = [(cookie.name, cookie.value), + ("path", cookie.path), + ("domain", cookie.domain)] + if cookie.port is not None: h.append(("port", cookie.port)) + if cookie.path_specified: h.append(("path_spec", None)) + if cookie.port_specified: h.append(("port_spec", None)) + if cookie.domain_initial_dot: h.append(("domain_dot", None)) + if cookie.secure: h.append(("secure", None)) + if cookie.expires: h.append(("expires", + time2isoz(float(cookie.expires)))) + if cookie.discard: h.append(("discard", None)) + if cookie.comment: h.append(("comment", cookie.comment)) + if cookie.comment_url: h.append(("commenturl", cookie.comment_url)) + + keys = cookie.rest.keys() + keys.sort() + for k in keys: + h.append((k, str(cookie.rest[k]))) + + h.append(("version", str(cookie.version))) + + return join_header_words([h]) + +def vals_sorted_by_key(adict): + keys = adict.keys() + keys.sort() + return map(adict.get, keys) + +class MappingIterator: + """Iterates over nested mapping, depth-first, in sorted order by key.""" + def __init__(self, mapping): + self._s = [(vals_sorted_by_key(mapping), 0, None)] # LIFO stack + + def __iter__(self): return self + + def next(self): + # this is hairy because of lack of generators + while 1: + try: + vals, i, prev_item = self._s.pop() + except IndexError: + raise StopIteration() + if i < len(vals): + item = vals[i] + i = i + 1 + self._s.append((vals, i, prev_item)) + try: + item.items + except AttributeError: + # non-mapping + break + else: + # mapping + self._s.append((vals_sorted_by_key(item), 0, item)) + continue + return item + + +# Used as second parameter to dict.get method, to distinguish absent +# dict key from one with a None value. +class Absent: pass + +class CookieJar: + """Collection of HTTP cookies. + + The major methods are extract_cookies and add_cookie_header; these are all + you are likely to need. 
In fact, you probably don't even need to know + about this class: use the cookie-aware extensions to the urllib2 callables + provided by this module: urlopen in particular (and perhaps also + build_opener, install_opener, HTTPCookieProcessor, HTTPRefererProcessor, + HTTPRefreshHandler, HTTPEquivProcessor, SeekableProcessor, etc.). + + CookieJar supports the iterator protocol. Iteration also works in 1.5.2: + + for cookie in cookiejar: + # do something with cookie + + Methods: + + CookieJar(filename=None, delayload=False, policy=None) + add_cookie_header(request, unverifiable=False) + extract_cookies(response, request, unverifiable=False) + make_cookies(response, request) + set_cookie_if_ok(cookie, request, unverifiable=False) + set_cookie(cookie) + save(filename=None, ignore_discard=False, ignore_expires=False) + load(filename=None, ignore_discard=False, ignore_expires=False) + revert(filename=None, ignore_discard=False, ignore_expires=False) + clear(domain=None, path=None, key=None) + clear_session_cookies() + clear_expired_cookies() + as_string(skip_discard=False) (str(cookies) also works) + + + Public attributes + + filename: filename for loading and saving cookies + policy: CookiePolicy object + + Public readable attributes + + delayload: request that cookies are lazily loaded from disk; this is only + a hint since this only affects performance, not behaviour (unless the + cookies on disk are changing); a CookieJar object may ignore it (in fact, + only MSIECookieJar lazily loads cookies at the moment) + cookies: a three-level dictionary [domain][path][key] containing Cookie + instances; you almost certainly don't need to use this + + """ + + non_word_re = re.compile(r"\W") + quote_re = re.compile(r"([\"\\])") + strict_domain_re = re.compile(r"\.?[^.]*") + domain_re = re.compile(r"[^.]*") + dots_re = re.compile(r"^\.+") + + magic_re = r"^\#LWP-Cookies-(\d+\.\d+)" + + def __init__(self, filename=None, delayload=False, policy=None): + """ + See CookieJar.__doc__ 
for argument documentation. + + Cookies are NOT loaded from the named file until either the load or + revert method is called. + + """ + self.filename = filename + self.delayload = delayload + + if policy is None: + policy = DefaultCookiePolicy() + self.policy = policy + + self._cookies_lock = _threading.RLock() + self.cookies = {} + + # for __getitem__ iteration in pre-2.2 Pythons + self._prev_getitem_index = 0 + + def _cookies_for_domain(self, domain, request, unverifiable): + """Return a list of cookies to be returned to server.""" + debug("Checking %s for cookies to return" % domain) + if not self.policy.domain_return_ok(domain, request, unverifiable): + return [] + + cookies_by_path = self.cookies.get(domain) + if cookies_by_path is None: + return [] + + cookies = [] + for path in cookies_by_path.keys(): + if not self.policy.path_return_ok(path, request, unverifiable): + continue + for name, cookie in cookies_by_path[path].items(): + if not self.policy.return_ok(cookie, request, unverifiable): + debug(" not returning cookie") + continue + debug(" it's a match") + cookies.append(cookie) + + return cookies + + def _cookie_attrs(self, cookies): + """Return a list of cookie-attributes to be returned to server. + + like ['foo="bar"; $Path="/"', ...] + + The $Version attribute is also added when appropriate (currently only + once per request). + + """ + # add cookies in order of most specific (ie. longest) path first + def decreasing_size(a, b): return cmp(len(b.path), len(a.path)) + cookies.sort(decreasing_size) + + version_set = False + + attrs = [] + for cookie in cookies: + # set version of Cookie header + # XXX + # What should it be if multiple matching Set-Cookie headers have + # different versions themselves? + # Answer: there is no answer; was supposed to be settled by + # RFC 2965 errata, but that may never appear... 
+ version = cookie.version + if not version_set: + version_set = True + if version > 0: + attrs.append("$Version=%s" % version) + + # quote cookie value if necessary + # (not for Netscape protocol, which already has any quotes + # intact, due to the poorly-specified Netscape Cookie: syntax) + if self.non_word_re.search(cookie.value) and version > 0: + value = self.quote_re.sub(r"\\\1", cookie.value) + else: + value = cookie.value + + # add cookie-attributes to be returned in Cookie header + if cookie.name is None: + attrs.append(value) + else: + attrs.append("%s=%s" % (cookie.name, value)) + if version > 0: + if cookie.path_specified: + attrs.append('$Path="%s"' % cookie.path) + if startswith(cookie.domain, "."): + domain = cookie.domain + if (not cookie.domain_initial_dot and + startswith(domain, ".")): + domain = domain[1:] + attrs.append('$Domain="%s"' % domain) + if cookie.port is not None: + p = "$Port" + if cookie.port_specified: + p = p + ('="%s"' % cookie.port) + attrs.append(p) + + return attrs + + def add_cookie_header(self, request, unverifiable=False): + """Add correct Cookie: header to request (urllib2.Request object). + + The Cookie2 header is also added unless policy.hide_cookie2 is true. + + The request object (usually a urllib2.Request instance) must support + the methods get_full_url, get_host, get_type and add_header, as + documented by urllib2, and the attributes headers (a mapping containing + the request's HTTP headers) and port (the port number). + + If unverifiable is true, it will be assumed that the transaction is + unverifiable as defined by RFC 2965, and appropriate action will be + taken. 
+ + """ + debug("add_cookie_header") + self._cookies_lock.acquire() + + self.policy._now = self._now = int(time.time()) + + req_host, erhn = eff_request_host(request) + strict_non_domain = \ + self.policy.strict_ns_domain & self.policy.DomainStrictNonDomain + + cookies = [] + + domain = erhn + # First check origin server effective host name for an exact match. + cookies.extend(self._cookies_for_domain(domain, request, unverifiable)) + # Then, start with effective request-host with initial dot prepended + # (for Netscape cookies with explicitly-set Domain cookie-attributes) + # -- eg. .foo.bar.baz.com and check all possible derived domain strings + # (.bar.baz.com, bar.baz.com, .baz.com) for cookies. + # This isn't too finicky about which domains to check, because we have + # to cover both V0 and V1 cookies, and policy.return_ok will check the + # domain in any case. + if not IPV4_RE.search(req_host): + # IP addresses must string-compare equal in order to domain-match + # (IP address case will have been checked above as erhn == req_host + # in that case). 
+ if domain != ".local": + domain = "."+domain + while string.find(domain, ".") >= 0: + cookies.extend(self._cookies_for_domain( + domain, request, unverifiable)) + if strict_non_domain: + domain = self.strict_domain_re.sub("", domain, 1) + else: + # strip either initial dot only, or initial component only + # .www.foo.com --> www.foo.com + # www.foo.com --> .foo.com + if startswith(domain, "."): + domain = domain[1:] + # we've already done the erhn + if domain == erhn: + domain = self.domain_re.sub("", domain, 1) + else: + domain = self.domain_re.sub("", domain, 1) + + attrs = self._cookie_attrs(cookies) + if attrs: + request.add_header("Cookie", string.join(attrs, "; ")) + + # if necessary, advertise that we know RFC 2965 + if self.policy.rfc2965 and not self.policy.hide_cookie2: + for cookie in cookies: + if cookie.version != 1: + request.add_header("Cookie2", '$Version="1"') + break + + self._cookies_lock.release() + + self.clear_expired_cookies() + + def _normalized_cookie_tuples(self, attrs_set): + """Return list of tuples containing normalised cookie information. + + attrs_set is the list of lists of key,value pairs extracted from + the Set-Cookie or Set-Cookie2 headers. + + Tuples are name, value, standard, rest, where name and value are the + cookie name and value, standard is a dictionary containing the standard + cookie-attributes (discard, secure, version, expires or max-age, + domain, path and port) and rest is a dictionary containing the rest of + the cookie-attributes. + + """ + cookie_tuples = [] + + boolean_attrs = "discard", "secure" + value_attrs = ("version", + "expires", "max-age", + "domain", "path", "port", + "comment", "commenturl") + + for cookie_attrs in attrs_set: + name, value = cookie_attrs[0] + + # Build dictionary of standard cookie-attributes (standard) and + # dictionary of other cookie-attributes (rest). + + # Note: expiry time is normalised to seconds since epoch. 
V0 + # cookies should have the Expires cookie-attribute, and V1 cookies + # should have Max-Age, but since V1 includes RFC 2109 cookies (and + # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we + # accept either (but prefer Max-Age). + max_age_set = False + + bad_cookie = False + + standard = {} + rest = {} + for k, v in cookie_attrs[1:]: + lc = string.lower(k) + # don't lose case distinction for unknown fields + if lc in value_attrs or lc in boolean_attrs: + k = lc + if k in boolean_attrs and v is None: + # boolean cookie-attribute is present, but has no value + # (like "discard", rather than "port=80") + v = True + if standard.has_key(k): + # only first value is significant + continue + if k == "domain": + if v is None: + debug(" missing value for domain attribute") + bad_cookie = True + break + # RFC 2965 section 3.3.3 + v = string.lower(v) + if k == "expires": + if max_age_set: + # Prefer max-age to expires (like Mozilla) + continue + if v is None: + debug(" missing or invalid value for expires " + "attribute: treating as session cookie") + continue + if k == "max-age": + max_age_set = True + try: + v = int(v) + except ValueError: + debug(" missing or invalid (non-numeric) value for " + "max-age attribute") + bad_cookie = True + break + # convert RFC 2965 Max-Age to seconds since epoch + # XXX Strictly you're supposed to follow RFC 2616 + # age-calculation rules. Remember that zero Max-Age is a + # is a request to discard (old and new) cookie, though. 
+ k = "expires" + v = self._now + v + if (k in value_attrs) or (k in boolean_attrs): + if (v is None and + k not in ["port", "comment", "commenturl"]): + debug(" missing value for %s attribute" % k) + bad_cookie = True + break + standard[k] = v + else: + rest[k] = v + + if bad_cookie: + continue + + cookie_tuples.append((name, value, standard, rest)) + + return cookie_tuples + + def _cookie_from_cookie_tuple(self, tup, request): + # standard is dict of standard cookie-attributes, rest is dict of the + # rest of them + name, value, standard, rest = tup + + domain = standard.get("domain", Absent) + path = standard.get("path", Absent) + port = standard.get("port", Absent) + expires = standard.get("expires", Absent) + + # set the easy defaults + version = standard.get("version", None) + if version is not None: version = int(version) + secure = standard.get("secure", False) + # (discard is also set if expires is Absent) + discard = standard.get("discard", False) + comment = standard.get("comment", None) + comment_url = standard.get("commenturl", None) + + # set default path + if path is not Absent and path != "": + path_specified = True + path = normalize_path(path) + else: + path_specified = False + path = request_path(request) + i = string.rfind(path, "/") + if i != -1: + if version == 0: + # Netscape spec parts company from reality here + path = path[:i] + else: + path = path[:i+1] + if len(path) == 0: path = "/" + + # set default domain + domain_specified = domain is not Absent + # but first we have to remember whether it starts with a dot + domain_initial_dot = False + if domain_specified: + domain_initial_dot = bool(startswith(domain, ".")) + if domain is Absent: + req_host, erhn = eff_request_host(request) + domain = erhn + elif not startswith(domain, "."): + domain = "."+domain + + # set default port + port_specified = False + if port is not Absent: + if port is None: + # Port attr present, but has no value: default to request port. 
+ # Cookie should then only be sent back on that port. + port = request_port(request) + else: + port_specified = True + port = re.sub(r"\s+", "", port) + else: + # No port attr present. Cookie can be sent back on any port. + port = None + + # set default expires and discard + if expires is Absent: + expires = None + discard = True + elif expires <= self._now: + # Expiry date in past is request to delete cookie. This can't be + # in DefaultCookiePolicy, because can't delete cookies there. + try: + del self.cookies[domain][path][name] + except KeyError: + pass + else: + debug("Expiring cookie, domain='%s', path='%s', name='%s'" % + (domain, path, name)) + return None + + return Cookie(version, + name, value, + port, port_specified, + domain, domain_specified, domain_initial_dot, + path, path_specified, + secure, + expires, + discard, + comment, + comment_url, + rest) + + def _cookies_from_attrs_set(self, attrs_set, request): + cookie_tuples = self._normalized_cookie_tuples(attrs_set) + cookies = [] + for tup in cookie_tuples: + cookie = self._cookie_from_cookie_tuple(tup, request) + if cookie: cookies.append(cookie) + return cookies + + def make_cookies(self, response, request): + """Return sequence of Cookie objects extracted from response object. + + See extract_cookies.__doc__ for the interfaces required of the + response and request arguments. 
+ + """ + # get cookie-attributes for RFC 2965 and Netscape protocols + headers = response.info() + rfc2965_hdrs = getheaders(headers, "Set-Cookie2") + ns_hdrs = getheaders(headers, "Set-Cookie") + + rfc2965 = self.policy.rfc2965 + netscape = self.policy.netscape + + if ((not rfc2965_hdrs and not ns_hdrs) or + (not ns_hdrs and not rfc2965) or + (not rfc2965_hdrs and not netscape) or + (not netscape and not rfc2965)): + return [] # no relevant cookie headers: quick exit + + try: + cookies = self._cookies_from_attrs_set( + split_header_words(rfc2965_hdrs), request) + except: + reraise_unmasked_exceptions() + cookies = [] + + if ns_hdrs and netscape: + try: + ns_cookies = self._cookies_from_attrs_set( + parse_ns_headers(ns_hdrs), request) + except: + reraise_unmasked_exceptions() + ns_cookies = [] + + # Look for Netscape cookies (from Set-Cookie headers) that match + # corresponding RFC 2965 cookies (from Set-Cookie2 headers). + # For each match, keep the RFC 2965 cookie and ignore the Netscape + # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are + # bundled in with the Netscape cookies for this purpose, which is + # reasonable behaviour. + if rfc2965: + lookup = {} + for cookie in cookies: + lookup[(cookie.domain, cookie.path, cookie.name)] = None + + def no_matching_rfc2965(ns_cookie, lookup=lookup): + key = ns_cookie.domain, ns_cookie.path, ns_cookie.name + return not lookup.has_key(key) + ns_cookies = filter(no_matching_rfc2965, ns_cookies) + + if ns_cookies: + cookies.extend(ns_cookies) + + return cookies + + def set_cookie_if_ok(self, cookie, request, unverifiable=False): + """Set a cookie if policy says it's OK to do so. 
+ + cookie: ClientCookie.Cookie instance + request: see extract_cookies.__doc__ for the required interface + unverifiable: see extract_cookies.__doc__ + + """ + self._cookies_lock.acquire() + self.policy._now = self._now = int(time.time()) + + if self.policy.set_ok(cookie, request, unverifiable): + self.set_cookie(cookie) + + self._cookies_lock.release() + + def set_cookie(self, cookie): + """Set a cookie, without checking whether or not it should be set. + + cookie: ClientCookie.Cookie instance + """ + c = self.cookies + self._cookies_lock.acquire() + try: + if not c.has_key(cookie.domain): c[cookie.domain] = {} + c2 = c[cookie.domain] + if not c2.has_key(cookie.path): c2[cookie.path] = {} + c3 = c2[cookie.path] + c3[cookie.name] = cookie + finally: + self._cookies_lock.release() + + def extract_cookies(self, response, request, unverifiable=False): + """Extract cookies from response, where allowable given the request. + + Look for allowable Set-Cookie: and Set-Cookie2: headers in the response + object passed as argument. Any of these headers that are found are + used to update the state of the object (subject to the policy.set_ok + method's approval). + + The response object (usually be the result of a call to + ClientCookie.urlopen, or similar) should support an info method, which + returns a mimetools.Message object (in fact, the 'mimetools.Message + object' may be any object that provides a getallmatchingheaders + method). + + The request object (usually a urllib2.Request instance) must support + the methods get_full_url and get_host, as documented by urllib2, and + the attributes headers (a mapping containing the request's HTTP + headers) and port (the port number). The request is used to set + default values for cookie-attributes as well as for checking that the + cookie is OK to be set. + + If unverifiable is true, it will be assumed that the transaction is + unverifiable as defined by RFC 2965, and appropriate action will be + taken. 
+ + """ + debug("extract_cookies: %s" % response.info()) + self._cookies_lock.acquire() + self.policy._now = self._now = int(time.time()) + + for cookie in self.make_cookies(response, request): + if self.policy.set_ok(cookie, request, unverifiable): + debug(" setting cookie: "+str(cookie)) + self.set_cookie(cookie) + self._cookies_lock.release() + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + """Save cookies to a file. + + filename: name of file in which to save cookies + ignore_discard: save even cookies set to be discarded + ignore_expires: save even cookies that have expired + + The file is overwritten if it already exists, thus wiping all its + cookies. Saved cookies can be restored later using the load or revert + methods. If filename is not specified, self.filename is used; if + self.filename is None, ValueError is raised. + + The CookieJar base class saves a sequence of "Set-Cookie3" lines. + "Set-Cookie3" is the format used by the libwww-perl libary, not known + to be compatible with any browser. The MozillaCookieJar subclass can + be used to save in a format compatible with the Netscape/Mozilla + browsers. + + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename, "w") + try: + # There really isn't an LWP Cookies 2.0 format, but this indicates + # that there is extra information in here (domain_dot and + # port_spec) while still being compatible with libwww-perl, I hope. + f.write("#LWP-Cookies-2.0\n") + f.write(self.as_lwp_str(not ignore_discard, not ignore_expires)) + finally: + f.close() + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + """Load cookies from a file. + + Old cookies are kept unless overwritten by newly loaded ones. + + Cookies in the file will be loaded even if they have expired or are + marked to be discarded. 
+ + If filename is not specified, self.filename is used; if self.filename + is None, ValueError is raised. The named file must be in the format + understood by the class, or IOError will be raised. This format will + be identical to that written by the save method, unless the load format + is not sufficiently well understood (as is the case for MSIECookieJar). + + Note for subclassers: overridden versions of this method should not + alter the object's state other than by calling self.set_cookie. + + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename) + try: + self._really_load(f, filename, ignore_discard, ignore_expires) + finally: + f.close() + + def _really_load(self, f, filename, ignore_discard, ignore_expires): + magic = f.readline() + if not re.search(self.magic_re, magic): + msg = "%s does not seem to contain cookies" % filename + raise IOError(msg) + + now = time.time() + + header = "Set-Cookie3:" + boolean_attrs = ("port_spec", "path_spec", "domain_dot", + "secure", "discard") + value_attrs = ("version", + "port", "path", "domain", + "expires", + "comment", "commenturl") + + try: + while 1: + line = f.readline() + if line == "": break + if not startswith(line, header): + continue + line = string.strip(line[len(header):]) + + for data in split_header_words([line]): + name, value = data[0] + # name and value are an exception here, since a plain "foo" + # (with no "=", unlike "bar=foo") means a cookie with no + # name and value "foo". With all other cookie-attributes, + # the situation is reversed: "foo" means an attribute named + # "foo" with no value! 
+ if value is None: + name, value = value, name + standard = {} + rest = {} + for k in boolean_attrs: + standard[k] = False + for k, v in data[1:]: + if k is not None: + lc = string.lower(k) + else: + lc = None + # don't lose case distinction for unknown fields + if (lc in value_attrs) or (lc in boolean_attrs): + k = lc + if k in boolean_attrs: + if v is None: v = True + standard[k] = v + elif k in value_attrs: + standard[k] = v + else: + rest[k] = v + + h = standard.get + expires = h("expires") + discard = h("discard") + if expires is not None: + expires = iso2time(expires) + if expires is None: + discard = True + domain = h("domain") + domain_specified = startswith(domain, ".") + c = Cookie(h("version"), name, value, + h("port"), h("port_spec"), + domain, domain_specified, h("domain_dot"), + h("path"), h("path_spec"), + h("secure"), + expires, + discard, + h("comment"), + h("commenturl"), + rest) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + self.set_cookie(c) + except: + reraise_unmasked_exceptions((IOError,)) + raise IOError("invalid Set-Cookie3 format file %s" % filename) + + def revert(self, filename=None, + ignore_discard=False, ignore_expires=False): + """Clear all cookies and reload cookies from a saved file. + + Raises IOError if reversion is not successful; the object's state will + not be altered if this happens. + + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + self._cookies_lock.acquire() + + old_state = copy.deepcopy(self.cookies) + self.cookies = {} + try: + self.load(filename, ignore_discard, ignore_expires) + except IOError: + self.cookies = old_state + raise + + self._cookies_lock.release() + + def clear(self, domain=None, path=None, name=None): + """Clear some cookies. + + Invoking this method without arguments will clear all cookies. 
If + given a single argument, only cookies belonging to that domain will be + removed. If given two arguments, cookies belonging to the specified + path within that domain are removed. If given three arguments, then + the cookie with the specified name, path and domain is removed. + + Raises KeyError if no matching cookie exists. + + """ + if name is not None: + if (domain is None) or (path is None): + raise ValueError( + "domain and path must be given to remove a cookie by name") + del self.cookies[domain][path][name] + elif path is not None: + if domain is None: + raise ValueError( + "domain must be given to remove cookies by path") + del self.cookies[domain][path] + elif domain is not None: + del self.cookies[domain] + else: + self.cookies = {} + + def clear_session_cookies(self): + """Discard all session cookies. + + Discards all cookies held by object which had either no Max-Age or + Expires cookie-attribute or an explicit Discard cookie-attribute, or + which otherwise have ended up with a true discard attribute. For + interactive browsers, the end of a session usually corresponds to + closing the browser window. + + Note that the save method won't save session cookies anyway, unless you + ask otherwise by passing a true ignore_discard argument. + + """ + self._cookies_lock.acquire() + for cookie in self: + if cookie.discard: + del self.cookies[cookie.domain][cookie.path][cookie.name] + self._cookies_lock.release() + + def clear_expired_cookies(self): + """Discard all expired cookies. + + You probably don't need to call this method: expired cookies are never + sent back to the server (provided you're using DefaultCookiePolicy), + this method is called by CookieJar itself every so often, and the save + method won't save expired cookies anyway (unless you ask otherwise by + passing a true ignore_expires argument). 
+ + """ + self._cookies_lock.acquire() + now = time.time() + for cookie in self: + if cookie.is_expired(now): + del self.cookies[cookie.domain][cookie.path][cookie.name] + self._cookies_lock.release() + + def __getitem__(self, i): + if i == 0: + self._getitem_iterator = self.__iter__() + elif self._prev_getitem_index != i-1: raise IndexError( + "CookieJar.__getitem__ only supports sequential iteration") + self._prev_getitem_index = i + try: + return self._getitem_iterator.next() + except StopIteration: + raise IndexError() + + def __iter__(self): + return MappingIterator(self.cookies) + + def __len__(self): + """Return number of contained cookies.""" + i = 0 + for cookie in self: i = i + 1 + return i + + def __repr__(self): + r = [] + for cookie in self: r.append(repr(cookie)) + return "<%s[%s]>" % (self.__class__, string.join(r, ", ")) + + def __str__(self): + r = [] + for cookie in self: r.append(str(cookie)) + return "<%s[%s]>" % (self.__class__, string.join(r, ", ")) + + def as_lwp_str(self, skip_discard=False, skip_expired=False): + """Return cookies as a string of "\n"-separated "Set-Cookie3" headers. + + If skip_discard is true, it will not return lines for cookies with the + Discard cookie-attribute. 
+ + """ + now = time.time() + r = [] + for cookie in self: + if skip_discard and cookie.discard: + continue + if skip_expired and cookie.is_expired(now): + continue + r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie)) + return string.join(r+[""], "\n") diff --git a/tools/bug_tool/ClientCookie/_Debug.py b/tools/bug_tool/ClientCookie/_Debug.py new file mode 100644 index 0000000000..17f050e252 --- /dev/null +++ b/tools/bug_tool/ClientCookie/_Debug.py @@ -0,0 +1,9 @@ +import ClientCookie + +def debug(text): + if ClientCookie.CLIENTCOOKIE_DEBUG: _debug(text) + +def _debug(text, *args): + if args: + text = text % args + ClientCookie.DEBUG_STREAM.write(text+"\n") diff --git a/tools/bug_tool/ClientCookie/_HeadersUtil.py b/tools/bug_tool/ClientCookie/_HeadersUtil.py new file mode 100644 index 0000000000..da7852c4e0 --- /dev/null +++ b/tools/bug_tool/ClientCookie/_HeadersUtil.py @@ -0,0 +1,224 @@ +"""HTTP header value parsing utility functions. + +from ClientCookie._HeadersUtil import split_header_words +values = split_header_words(h.headers["Content-Type"]) + +This module provides a few functions that help parsing and construction of +valid HTTP header values. + + +Copyright 1997-1998, Gisle Aas +Copyright 2002-2003, John J. Lee + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD License (see the file COPYING included with the +distribution). 
+ +""" + +import re, string +from types import StringType +try: + from types import UnicodeType + STRING_TYPES = StringType, UnicodeType +except: + STRING_TYPES = StringType, + +from _Util import startswith, endswith, http2time + +try: True +except NameError: + True = 1 + False = 0 + +def unmatched(match): + """Return unmatched part of re.Match object.""" + start, end = match.span(0) + return match.string[:start]+match.string[end:] + +# XXX I really can't see what this =* was for (came from LWP, I guess) +#token_re = re.compile(r"^\s*(=*[^\s=;,]+)") +token_re = re.compile(r"^\s*([^=\s;,]+)") +quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"") +value_re = re.compile(r"^\s*=\s*([^\s;,]*)") +escape_re = re.compile(r"\\(.)") +def split_header_words(header_values): + r"""Parse header values into a list of lists containing key,value pairs. + + The function knows how to deal with ",", ";" and "=" as well as quoted + values after "=". A list of space separated tokens are parsed as if they + were separated by ";". + + If the header_values passed as argument contains multiple values, then they + are treated as if they were a single value separated by comma ",". + + This means that this function is useful for parsing header fields that + follow this syntax (BNF as from the HTTP/1.1 specification, but we relax + the requirement for tokens). + + headers = #header + header = (token | parameter) *( [";"] (token | parameter)) + + token = 1*<any CHAR except CTLs or separators> + separators = "(" | ")" | "<" | ">" | "@" + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT + + quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) + qdtext = <any TEXT except <">> + quoted-pair = "\" CHAR + + parameter = attribute "=" value + attribute = token + value = token | quoted-string + + Each header is represented by a list of key/value pairs. The value for a + simple token (not part of a parameter) is None. 
Syntactically incorrect + headers will not necessarily be parsed as you would want. + + This is easier to describe with some examples: + + >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz']) + [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]] + >>> split_header_words(['text/html; charset="iso-8859-1"']) + [[('text/html', None), ('charset', 'iso-8859-1')]] + >>> split_header_words([r'Basic realm="\"foo\bar\""']) + [[('Basic', None), ('realm', '"foobar"')]] + + """ + assert type(header_values) not in STRING_TYPES + result = [] + for text in header_values: + orig_text = text + pairs = [] + while text: + m = token_re.search(text) + if m: + text = unmatched(m) + name = m.group(1) + m = quoted_value_re.search(text) + if m: # quoted value + text = unmatched(m) + value = m.group(1) + value = escape_re.sub(r"\1", value) + else: + m = value_re.search(text) + if m: # unquoted value + text = unmatched(m) + value = m.group(1) + value = string.rstrip(value) + else: + # no value, a lone token + value = None + pairs.append((name, value)) + elif startswith(string.lstrip(text), ","): + # concatenated headers, as per RFC 2616 section 4.2 + text = string.lstrip(text)[1:] + if pairs: result.append(pairs) + pairs = [] + else: + # skip junk + non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text) + assert nr_junk_chars > 0, ( + "split_header_words bug: '%s', '%s', %s" % + (orig_text, text, pairs)) + text = non_junk + if pairs: result.append(pairs) + return result + +join_escape_re = re.compile(r"([\"\\])") +def join_header_words(lists): + """Do the inverse of the conversion done by split_header_words. + + Takes a list of lists of (key, value) pairs and produces a single header + value. Attribute values are quoted if needed. 
+ + >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]]) + 'text/plain; charset="iso-8859/1"' + >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]]) + 'text/plain, charset="iso-8859/1"' + + """ + headers = [] + for pairs in lists: + attr = [] + for k, v in pairs: + if v is not None: + if not re.search(r"^\w+$", v): + v = join_escape_re.sub(r"\\\1", v) # escape " and \ + v = '"%s"' % v + if k is None: # Netscape cookies may have no name + k = v + else: + k = "%s=%s" % (k, v) + attr.append(k) + if attr: headers.append(string.join(attr, "; ")) + return string.join(headers, ", ") + +def parse_ns_headers(ns_headers): + """Ad-hoc parser for Netscape protocol cookie-attributes. + + The old Netscape cookie format for Set-Cookie can for instance contain + an unquoted "," in the expires field, so we have to use this ad-hoc + parser instead of split_header_words. + + XXX This may not make the best possible effort to parse all the crap + that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient + parser is probably better, so could do worse than following that if + this ever gives any trouble. + + Currently, this is also used for parsing RFC 2109 cookies. + + """ + known_attrs = ("expires", "domain", "path", "secure", + # RFC 2109 attrs (may turn up in Netscape cookies, too) + "port", "max-age") + + result = [] + for ns_header in ns_headers: + pairs = [] + version_set = False + for param in re.split(r";\s*", ns_header): + param = string.rstrip(param) + if param == "": continue + if "=" not in param: + if string.lower(param) in known_attrs: + k, v = param, None + else: + # cookie with missing name + k, v = None, param + else: + k, v = re.split(r"\s*=\s*", param, 1) + k = string.lstrip(k) + if k is not None: + lc = string.lower(k) + if lc in known_attrs: + k = lc + if k == "version": + # This is an RFC 2109 cookie. Will be treated as RFC 2965 + # cookie in rest of code. 
+ # Probably it should be parsed with split_header_words, but + # that's too much hassle. + version_set = True + if k == "expires": + # convert expires date to seconds since epoch + if startswith(v, '"'): v = v[1:] + if endswith(v, '"'): v = v[:-1] + v = http2time(v) # None if invalid + pairs.append((k, v)) + + if pairs: + if not version_set: + pairs.append(("version", "0")) + result.append(pairs) + + return result + + +def _test(): + import doctest, _HeadersUtil + return doctest.testmod(_HeadersUtil) + +if __name__ == "__main__": + _test() diff --git a/tools/bug_tool/ClientCookie/_MSIECookieJar.py b/tools/bug_tool/ClientCookie/_MSIECookieJar.py new file mode 100644 index 0000000000..5c2d3fcf24 --- /dev/null +++ b/tools/bug_tool/ClientCookie/_MSIECookieJar.py @@ -0,0 +1,377 @@ +"""Mozilla / Netscape cookie loading / saving. + +Copyright 1997-1999 Gisle Aas (libwww-perl) +Copyright 2002-2003 Johnny Lee <typo_pl@hotmail.com> (MSIE Perl code) +Copyright 2002-2003 John J Lee <jjl@pobox.com> (The Python port) + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD License (see the file COPYING included with the +distribution). + +""" + +import os, re, string, time, struct +if os.name == "nt": + import _winreg + +from _ClientCookie import CookieJar, Cookie, MISSING_FILENAME_TEXT +from _Util import startswith +from _Debug import debug + +try: True +except NameError: + True = 1 + False = 0 + + +def regload(path, leaf): + key = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, path, 0, _winreg.KEY_ALL_ACCESS) + try: + value = _winreg.QueryValueEx(key, leaf)[0] + except WindowsError: + value = None + return value + +WIN32_EPOCH = 0x019db1ded53e8000L # 1970 Jan 01 00:00:00 in Win32 FILETIME + +def epoch_time_offset_from_win32_filetime(filetime): + """Convert from win32 filetime to seconds-since-epoch value. + + MSIE stores create and expire times as Win32 FILETIME, which is 64 + bits of 100 nanosecond intervals since Jan 01 1601. 
+ + Cookies code expects time in 32-bit value expressed in seconds since + the epoch (Jan 01 1970). + + """ + if filetime < WIN32_EPOCH: + raise ValueError("filetime (%d) is before epoch (%d)" % + (filetime, WIN32_EPOCH)) + + return divmod((filetime - WIN32_EPOCH), 10000000L)[0] + +def binary_to_char(c): return "%02X" % ord(c) +def binary_to_str(d): return string.join(map(binary_to_char, list(d)), "") + +class MSIECookieJar(CookieJar): + """ + This class differs from CookieJar only in the format it uses to load cookies + from a file. + + MSIECookieJar can read the cookie files of Microsoft Internet Explorer + (MSIE) for Windows, versions 5 and 6, on Windows NT and XP respectively. + Other configurations may also work, but are untested. Saving cookies in + MSIE format is NOT supported. If you save cookies, they'll be in the usual + Set-Cookie3 format, which you can read back in using an instance of the + plain old CookieJar class. Don't save using the same filename that you + loaded cookies from, because you may succeed in clobbering your MSIE + cookies index file! + + You should be able to have LWP share Internet Explorer's cookies like + this (note you need to supply a username to load_from_registry if you're on + Windows 9x): + + cookies = MSIECookieJar(delayload=1) + # find cookies index file in registry and load cookies from it + cookies.load_from_registry() + opener = ClientCookie.build_opener(ClientCookie.HTTPHandler(cookies)) + response = opener.open("http://foo.bar.com/") + + Iterating over a delayloaded MSIECookieJar instance will not cause any + cookies to be read from disk. To force reading of all cookies from disk, + call read_all_cookies. Note that the following methods iterate over self: + clear_temporary_cookies, clear_expired_cookies, __len__, __repr__, __str__ + and as_string. 
+ + Additional methods: + + load_from_registry(ignore_discard=False, ignore_expires=False, + username=None) + load_cookie_data(filename, ignore_discard=False, ignore_expires=False) + read_all_cookies() + + """ + magic_re = re.compile(r"Client UrlCache MMF Ver \d\.\d.*") + padding = "\x0d\xf0\xad\x0b" + + msie_domain_re = re.compile(r"^([^/]+)(/.*)$") + cookie_re = re.compile("Cookie\:.+\@([\x21-\xFF]+).*?" + "(.+\@[\x21-\xFF]+\.txt)") + + # path under HKEY_CURRENT_USER from which to get location of index.dat + reg_path = r"software\microsoft\windows" \ + r"\currentversion\explorer\shell folders" + reg_key = "Cookies" + + def __init__(self, *args, **kwargs): + apply(CookieJar.__init__, (self, args, kwargs)) + self._delayload_domains = {} + + def set_cookie(self, cookie): + if self.delayload: + self._delayload_domain(cookie.domain) + CookieJar.set_cookie(self, cookie) + + def _cookies_for_domain(self, domain, request, unverifiable): + debug("Checking %s for cookies to return" % domain) + if not self.policy.domain_return_ok(domain, request, unverifiable): + return [] + + if self.delayload: + self._delayload_domain(domain) + + return CookieJar._cookies_for_domain( + self, domain, request, unverifiable) + + def read_all_cookies(self): + """Eagerly read in all cookies.""" + if self.delayload: + for domain in self._delayload_domains.keys(): + self._delayload_domain(domain) + + def _delayload_domain(self, domain): + # if necessary, lazily load cookies for this domain + delayload_info = self._delayload_domains.get(domain) + if delayload_info is not None: + cookie_file, ignore_discard, ignore_expires = delayload_info + try: + self.load_cookie_data(cookie_file, + ignore_discard, ignore_expires) + except IOError: + debug("error reading cookie file, skipping: %s" % cookie_file) + else: + del self._delayload_domains[domain] + + def _load_cookies_from_file(self, filename): + cookies = [] + + cookies_fh = open(filename) + + try: + while 1: + key = cookies_fh.readline() + if key == 
"": break + + rl = cookies_fh.readline + def getlong(rl=rl): return long(rl().rstrip()) + def getstr(rl=rl): return rl().rstrip() + + key = key.rstrip() + value = getstr() + domain_path = getstr() + flags = getlong() # 0x2000 bit is for secure I think + lo_expire = getlong() + hi_expire = getlong() + lo_create = getlong() + hi_create = getlong() + sep = getstr() + + if "" in (key, value, domain_path, flags, hi_expire, lo_expire, + hi_create, lo_create, sep) or (sep != "*"): + break + + m = self.msie_domain_re.search(domain_path) + if m: + domain = m.group(1) + path = m.group(2) + + cookies.append({"KEY": key, "VALUE": value, "DOMAIN": domain, + "PATH": path, "FLAGS": flags, "HIXP": hi_expire, + "LOXP": lo_expire, "HICREATE": hi_create, + "LOCREATE": lo_create}) + finally: + cookies_fh.close() + + return cookies + + def load_cookie_data(self, filename, + ignore_discard=False, ignore_expires=False): + """Load cookies from file containing actual cookie data. + + Old cookies are kept unless overwritten by newly loaded ones. + + You should not call this method if the delayload attribute is set. + + I think each of these files contain all cookies for one user, domain, + and path. + + filename: file containing cookies -- usually found in a file like + C:\WINNT\Profiles\joe\Cookies\joe@blah[1].txt + + """ + now = int(time.time()) + + cookie_data = self._load_cookies_from_file(filename) + + for cookie in cookie_data: + flags = cookie["FLAGS"] + secure = ((flags & 0x2000) != 0) + filetime = (cookie["HIXP"] << 32) + cookie["LOXP"] + expires = epoch_time_offset_from_win32_filetime(filetime) + if expires < now: + discard = True + else: + discard = False + domain = cookie["DOMAIN"] + initial_dot = startswith(domain, ".") + if initial_dot: + domain_specified = True + else: + # MSIE 5 does not record whether the domain cookie-attribute + # was specified. 
+ # Assuming it wasn't is conservative, because with strict + # domain matching this will match less frequently; with regular + # Netscape tail-matching, this will match at exactly the same + # times that domain_specified = True would. It also means we + # don't have to prepend a dot to achieve consistency with our + # own & Mozilla's domain-munging scheme. + domain_specified = False + + # assume path_specified is false + # XXX is there other stuff in here? -- eg. comment, commentURL? + c = Cookie(0, + cookie["KEY"], cookie["VALUE"], + None, False, + domain, domain_specified, initial_dot, + cookie["PATH"], False, + secure, + expires, + discard, + None, + None, + {"flags": flags}) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + self.set_cookie(c) + + def load_from_registry(self, ignore_discard=False, ignore_expires=False, + username=None): + """ + username: only required on win9x + + """ + cookies_dir = regload(self.reg_path, self.reg_key) + filename = os.path.normpath(os.path.join(cookies_dir, "INDEX.DAT")) + self.load(filename, ignore_discard, ignore_expires, username) + + def load(self, filename, ignore_discard=False, ignore_expires=False, + username=None): + """Load cookies from an MSIE 'index.dat' cookies index file. 
+ + filename: full path to cookie index file + username: only required on win9x + + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + index = open(filename, "rb") + + try: + self._really_load(index, filename, ignore_discard, ignore_expires, + username) + finally: + index.close() + + def _really_load(self, index, filename, ignore_discard, ignore_expires, + username): + now = int(time.time()) + + if username is None: + username = string.lower(os.environ['USERNAME']) + + cookie_dir = os.path.dirname(filename) + + data = index.read(256) + if len(data) != 256: + raise IOError("%s file is too short" % filename) + + # Cookies' index.dat file starts with 32 bytes of signature + # followed by an offset to the first record, stored as a little- + # endian DWORD. + sig, size, data = data[:32], data[32:36], data[36:] + size = struct.unpack("<L", size)[0] + + # check that sig is valid + if not self.magic_re.match(sig) or size != 0x4000: + raise IOError("%s ['%s' %s] does not seem to contain cookies" % + (str(filename), sig, size)) + + # skip to start of first record + index.seek(size, 0) + + sector = 128 # size of sector in bytes + + while 1: + data = "" + + # Cookies are usually in two contiguous sectors, so read in two + # sectors and adjust if not a Cookie. + to_read = 2 * sector + d = index.read(to_read) + if len(d) != to_read: + break + data = data + d + + # Each record starts with a 4-byte signature and a count + # (little-endian DWORD) of sectors for the record. 
+ sig, size, data = data[:4], data[4:8], data[8:] + size = struct.unpack("<L", size)[0] + + to_read = (size - 2) * sector + +## from urllib import quote +## print "data", quote(data) +## print "sig", quote(sig) +## print "size in sectors", size +## print "size in bytes", size*sector +## print "size in units of 16 bytes", (size*sector) / 16 +## print "size to read in bytes", to_read +## print + + if sig != "URL ": + assert (sig in ("HASH", "LEAK", + self.padding, "\x00\x00\x00\x00"), + "unrecognized MSIE index.dat record: %s" % + binary_to_str(sig)) + if sig == "\x00\x00\x00\x00": + # assume we've got all the cookies, and stop + break + if sig == self.padding: + continue + # skip the rest of this record + assert to_read >= 0 + if size != 2: + assert to_read != 0 + index.seek(to_read, 1) + continue + + # read in rest of record if necessary + if size > 2: + more_data = index.read(to_read) + if len(more_data) != to_read: break + data = data + more_data + + cookie_re = ("Cookie\:%s\@([\x21-\xFF]+).*?" % username + + "(%s\@[\x21-\xFF]+\.txt)" % username) + m = re.search(cookie_re, data, re.I) + if m: + cookie_file = os.path.join(cookie_dir, m.group(2)) + if not self.delayload: + try: + self.load_cookie_data(cookie_file, + ignore_discard, ignore_expires) + except IOError: + debug("error reading cookie file, skipping: %s" % + cookie_file) + else: + domain = m.group(1) + i = domain.find("/") + if i != -1: + domain = domain[:i] + + self._delayload_domains[domain] = ( + cookie_file, ignore_discard, ignore_expires) diff --git a/tools/bug_tool/ClientCookie/_MozillaCookieJar.py b/tools/bug_tool/ClientCookie/_MozillaCookieJar.py new file mode 100644 index 0000000000..13239c3c54 --- /dev/null +++ b/tools/bug_tool/ClientCookie/_MozillaCookieJar.py @@ -0,0 +1,171 @@ +"""Mozilla / Netscape cookie loading / saving. 
+ +Copyright 1997-1999 Gisle Aas (libwww-perl) +Copyright 2002-2003 John J Lee <jjl@pobox.com> (The Python port) + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD License (see the file COPYING included with the +distribution). + +""" + +import sys, re, string, time + +import ClientCookie +from _ClientCookie import CookieJar, Cookie, MISSING_FILENAME_TEXT +from _Util import startswith, endswith +from _Debug import debug + +try: True +except NameError: + True = 1 + False = 0 + +try: issubclass(Exception(), (Exception,)) +except TypeError: + real_issubclass = issubclass + from _Util import compat_issubclass + issubclass = compat_issubclass + del compat_issubclass + + +class MozillaCookieJar(CookieJar): + """ + + WARNING: you may want to backup your browser's cookies file if you use + this class to save cookies. I *think* it works, but there have been + bugs in the past! + + This class differs from CookieJar only in the format it uses to save and + load cookies to and from a file. This class uses the Netscape/Mozilla + `cookies.txt' format. + + Don't expect cookies saved while the browser is running to be noticed by + the browser (in fact, Mozilla on unix will overwrite your saved cookies if + you change them on disk while it's running; on Windows, you probably can't + save at all while the browser is running). + + Note that the Netscape/Mozilla format will downgrade RFC2965 cookies to + Netscape cookies on saving. + + In particular, the cookie version and port number information is lost, + together with information about whether or not Path, Port and Discard were + specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the + domain as set in the HTTP header started with a dot (yes, I'm aware some + domains in Netscape files start with a dot and some don't -- trust me, you + really don't want to know any more about this). 
+ + Note that though Mozilla and Netscape use the same format, they use + slightly different headers. The class saves cookies using the Netscape + header by default (Mozilla can cope with that). + + """ + magic_re = "#( Netscape)? HTTP Cookie File" + header = """\ + # Netscape HTTP Cookie File + # http://www.netscape.com/newsref/std/cookie_spec.html + # This is a generated file! Do not edit. + +""" + + def _really_load(self, f, filename, ignore_discard, ignore_expires): + now = time.time() + + magic = f.readline() + if not re.search(self.magic_re, magic): + f.close() + raise IOError( + "%s does not look like a Netscape format cookies file" % + filename) + + try: + while 1: + line = f.readline() + if line == "": break + + # last field may be absent, so keep any trailing tab + if endswith(line, "\n"): line = line[:-1] + + # skip comments and blank lines XXX what is $ for? + if (startswith(string.strip(line), "#") or + startswith(string.strip(line), "$") or + string.strip(line) == ""): + continue + + domain, domain_specified, path, secure, expires, name, value = \ + string.split(line, "\t") + secure = (secure == "TRUE") + domain_specified = (domain_specified == "TRUE") + if name == "": name = None + + initial_dot = startswith(domain, ".") + assert domain_specified == initial_dot + + discard = False + if expires == "": + expires = None + discard = True + + # assume path_specified is false + c = Cookie(0, name, value, + None, False, + domain, domain_specified, initial_dot, + path, False, + secure, + expires, + discard, + None, + None, + {}) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + self.set_cookie(c) + + except: + unmasked = (KeyboardInterrupt, SystemExit) + if ClientCookie.CLIENTCOOKIE_DEBUG: + unmasked = (Exception,) + etype = sys.exc_info()[0] + if issubclass(etype, IOError) or \ + issubclass(etype, unmasked): + raise + raise IOError("invalid Netscape format file %s: %s" % + (filename, line)) + + def 
save(self, filename=None, ignore_discard=False, ignore_expires=False): + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename, "w") + try: + f.write(self.header) + now = time.time() + debug("Saving Netscape cookies.txt file") + for cookie in self: + if not ignore_discard and cookie.discard: + debug(" Not saving %s: marked for discard" % cookie.name) + continue + if not ignore_expires and cookie.is_expired(now): + debug(" Not saving %s: expired" % cookie.name) + continue + if cookie.secure: secure = "TRUE" + else: secure = "FALSE" + if startswith(cookie.domain, "."): initial_dot = "TRUE" + else: initial_dot = "FALSE" + if cookie.expires is not None: + expires = str(cookie.expires) + else: + expires = "" + if cookie.name is not None: + name = cookie.name + else: + name = "" + f.write( + string.join([cookie.domain, initial_dot, cookie.path, + secure, expires, name, cookie.value], "\t")+ + "\n") + finally: + f.close() diff --git a/tools/bug_tool/ClientCookie/_Util.py b/tools/bug_tool/ClientCookie/_Util.py new file mode 100644 index 0000000000..f4c4e37ccf --- /dev/null +++ b/tools/bug_tool/ClientCookie/_Util.py @@ -0,0 +1,459 @@ +"""Python backwards-compat., date/time routines, seekable file object wrapper. + + Copyright 2002-2003 John J Lee <jjl@pobox.com> + +This code is free software; you can redistribute it and/or modify it under +the terms of the BSD License (see the file COPYING included with the +distribution). 
+ +""" + +try: True +except NameError: + True = 1 + False = 0 + +import re, string, time +from types import TupleType +from StringIO import StringIO + +try: + from exceptions import StopIteration +except ImportError: + from ClientCookie._ClientCookie import StopIteration + +def startswith(string, initial): + if len(initial) > len(string): return False + return string[:len(initial)] == initial + +def endswith(string, final): + if len(final) > len(string): return False + return string[-len(final):] == final + +def compat_issubclass(obj, tuple_or_class): + # for 2.1 and below + if type(tuple_or_class) == TupleType: + for klass in tuple_or_class: + if issubclass(obj, klass): + return True + return False + return issubclass(obj, tuple_or_class) + +def isstringlike(x): + try: x+"" + except: return False + else: return True + + +try: + from calendar import timegm + timegm((2045, 1, 1, 22, 23, 32)) # overflows in 2.1 +except: + # Number of days per month (except for February in leap years) + mdays = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] + + # Return 1 for leap years, 0 for non-leap years + def isleap(year): + return year % 4 == 0 and (year % 100 <> 0 or year % 400 == 0) + + # Return number of leap years in range [y1, y2) + # Assume y1 <= y2 and no funny (non-leap century) years + def leapdays(y1, y2): + return (y2+3)/4 - (y1+3)/4 + + EPOCH = 1970 + def timegm(tuple): + """Unrelated but handy function to calculate Unix timestamp from GMT.""" + year, month, day, hour, minute, second = tuple[:6] + assert year >= EPOCH + assert 1 <= month <= 12 + days = 365*(year-EPOCH) + leapdays(EPOCH, year) + for i in range(1, month): + days = days + mdays[i] + if month > 2 and isleap(year): + days = days + 1 + days = days + day - 1 + hours = days*24 + hour + minutes = hours*60 + minute + seconds = minutes*60L + second + return seconds + + +# Date/time conversion routines for formats used by the HTTP protocol. 
+ +EPOCH = 1970 +def my_timegm(tt): + year, month, mday, hour, min, sec = tt[:6] + if ((year >= EPOCH) and (1 <= month <= 12) and (1 <= mday <= 31) and + (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)): + return timegm(tt) + else: + return None + +days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] +months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] +months_lower = [] +for month in months: months_lower.append(string.lower(month)) + + +def time2isoz(t=None): + """Return a string representing time in seconds since epoch, t. + + If the function is called without an argument, it will use the current + time. + + The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ", + representing Universal Time (UTC, aka GMT). An example of this format is: + + 1994-11-24 08:49:37Z + + """ + if t is None: t = time.time() + year, mon, mday, hour, min, sec = time.gmtime(t)[:6] + return "%04d-%02d-%02d %02d:%02d:%02dZ" % ( + year, mon, mday, hour, min, sec) + +def time2netscape(t=None): + """Return a string representing time in seconds since epoch, t. + + If the function is called without an argument, it will use the current + time. 
+ + The format of the returned string is like this: + + Wdy, DD-Mon-YYYY HH:MM:SS GMT + + """ + if t is None: t = time.time() + year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7] + return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % ( + days[wday], mday, months[mon-1], year, hour, min, sec) + + +UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None} + +timezone_re = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$") +def offset_from_tz_string(tz): + offset = None + if UTC_ZONES.has_key(tz): + offset = 0 + else: + m = timezone_re.search(tz) + if m: + offset = 3600 * int(m.group(2)) + if m.group(3): + offset = offset + 60 * int(m.group(3)) + if m.group(1) == '-': + offset = -offset + return offset + +def _str2time(day, mon, yr, hr, min, sec, tz): + # translate month name to number + # month numbers start with 1 (January) + try: + mon = months_lower.index(string.lower(mon))+1 + except ValueError: + # maybe it's already a number + try: + imon = int(mon) + except ValueError: + return None + if 1 <= imon <= 12: + mon = imon + else: + return None + + # make sure clock elements are defined + if hr is None: hr = 0 + if min is None: min = 0 + if sec is None: sec = 0 + + yr = int(yr) + day = int(day) + hr = int(hr) + min = int(min) + sec = int(sec) + + if yr < 1000: + # find "obvious" year + cur_yr = time.localtime(time.time())[0] + m = cur_yr % 100 + tmp = yr + yr = yr + cur_yr - m + m = m - tmp + if abs(m) > 50: + if m > 0: yr = yr + 100 + else: yr = yr - 100 + + # convert UTC time tuple to seconds since epoch (not timezone-adjusted) + t = my_timegm((yr, mon, day, hr, min, sec, tz)) + + if t is not None: + # adjust time using timezone string, to get absolute time since epoch + if tz is None: + tz = "UTC" + tz = string.upper(tz) + offset = offset_from_tz_string(tz) + if offset is None: + return None + t = t - offset + + return t + + +strict_re = re.compile(r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) (\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$") +wkday_re = re.compile( + 
r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I) +loose_http_re = re.compile( + r"""^ + (\d\d?) # day + (?:\s+|[-\/]) + (\w+) # month + (?:\s+|[-\/]) + (\d+) # year + (?: + (?:\s+|:) # separator before clock + (\d\d?):(\d\d) # hour:min + (?::(\d\d))? # optional seconds + )? # optional clock + \s* + ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone + \s* + (?:\(\w+\))? # ASCII representation of timezone in parens. + \s*$""", re.X) +def http2time(text): + """Returns time in seconds since epoch of time represented by a string. + + Return value is an integer. + + None is returned if the format of str is unrecognized, the time is outside + the representable range, or the timezone string is not recognized. The + time formats recognized are the same as for parse_date. If the string + contains no timezone, UTC is assumed. + + The timezone in the string may be numerical (like "-0800" or "+0100") or a + string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the + timezone strings equivalent to UTC (zero offset) are known to the function. + + The function loosely parses the following formats: + + Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format + Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format + Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format + 09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday) + 08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday) + 08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday) + + The parser ignores leading and trailing whitespace. The time may be + absent. + + If the year is given with only 2 digits, then parse_date will select the + century that makes the year closest to the current date. + + """ + # fast exit for strictly conforming string + m = strict_re.search(text) + if m: + g = m.groups() + mon = months_lower.index(string.lower(g[1])) + 1 + tt = (int(g[2]), mon, int(g[0]), + int(g[3]), int(g[4]), float(g[5])) + return my_timegm(tt) + + # No, we need some messy parsing... 
+ + # clean up + text = string.lstrip(text) + text = wkday_re.sub("", text, 1) # Useless weekday + + # tz is time zone specifier string + day, mon, yr, hr, min, sec, tz = [None]*7 + + # loose regexp parse + m = loose_http_re.search(text) + if m is not None: + day, mon, yr, hr, min, sec, tz = m.groups() + else: + return None # bad format + + return _str2time(day, mon, yr, hr, min, sec, tz) + + +iso_re = re.compile( + """^ + (\d{4}) # year + [-\/]? + (\d\d?) # numerical month + [-\/]? + (\d\d?) # day + (?: + (?:\s+|[-:Tt]) # separator before clock + (\d\d?):?(\d\d) # hour:min + (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional) + )? # optional clock + \s* + ([-+]?\d\d?:?(:?\d\d)? + |Z|z)? # timezone (Z is "zero meridian", i.e. GMT) + \s*$""", re.X) +def iso2time(text): + """ + As for httpstr2time, but parses the ISO 8601 formats: + + 1994-02-03 14:15:29 -0100 -- ISO 8601 format + 1994-02-03 14:15:29 -- zone is optional + 1994-02-03 -- only date + 1994-02-03T14:15:29 -- Use T as separator + 19940203T141529Z -- ISO 8601 compact format + 19940203 -- only date + + """ + # clean up + text = string.lstrip(text) + + # tz is time zone specifier string + day, mon, yr, hr, min, sec, tz = [None]*7 + + # loose regexp parse + m = iso_re.search(text) + if m is not None: + # XXX there's an extra bit of the timezone I'm ignoring here: is + # this the right thing to do? + yr, mon, day, hr, min, sec, tz, _ = m.groups() + else: + return None # bad format + + return _str2time(day, mon, yr, hr, min, sec, tz) + + + +# XXX Andrew Dalke kindly sent me a similar class in response to my request on +# comp.lang.python, which I then proceeded to lose. I wrote this class +# instead, but I think he's released his code publicly since, could pinch the +# tests from it, at least... +class seek_wrapper: + """Adds a seek method to a file object. + + This is only designed for seeking on readonly file-like objects. + + Wrapped file-like object must have a read method. 
The readline method is + only supported if that method is present on the wrapped object. The + readlines method is always supported. xreadlines and iteration are + supported only for Python 2.2 and above. + + Public attribute: wrapped (the wrapped file object). + + WARNING: All other attributes of the wrapped object (ie. those that are not + one of wrapped, read, readline, readlines, xreadlines, __iter__ and next) + are passed through unaltered, which may or may not make sense for your + particular file object. + + """ + # General strategy is to check that cache is full enough, then delegate + # everything to the cache (self._cache, which is a StringIO.StringIO + # instance. Seems to be some cStringIO.StringIO problem on 1.5.2 -- I + # get a StringOobject, with no readlines method. + + # Invariant: the end of the cache is always at the same place as the + # end of the wrapped file: + # self.wrapped.tell() == len(self._cache.getvalue()) + + def __init__(self, wrapped): + self.wrapped = wrapped + self.__have_readline = hasattr(self.wrapped, "readline") + self.__cache = StringIO() + + def __getattr__(self, name): return getattr(self.wrapped, name) + + def seek(self, offset, whence=0): + # make sure we have read all data up to the point we are seeking to + pos = self.__cache.tell() + if whence == 0: # absolute + to_read = offset - pos + elif whence == 1: # relative to current position + to_read = offset + elif whence == 2: # relative to end of *wrapped* file + # since we don't know yet where the end of that file is, we must + # read everything + to_read = None + if to_read >= 0 or to_read is None: + if to_read is None: + self.__cache.write(self.wrapped.read()) + else: + self.__cache.write(self.wrapped.read(to_read)) + self.__cache.seek(pos) + + return self.__cache.seek(offset, whence) + + def read(self, size=-1): + pos = self.__cache.tell() + + self.__cache.seek(pos) + + end = len(self.__cache.getvalue()) + available = end - pos + + # enough data already cached? 
+ if size <= available and size != -1: + return self.__cache.read(size) + + # no, so read sufficient data from wrapped file and cache it + to_read = size - available + assert to_read > 0 or size == -1 + self.__cache.seek(0, 2) + if size == -1: + self.__cache.write(self.wrapped.read()) + else: + self.__cache.write(self.wrapped.read(to_read)) + self.__cache.seek(pos) + + return self.__cache.read(size) + + def readline(self, size=-1): + if not self.__have_readline: + raise NotImplementedError("no readline method on wrapped object") + + # line we're about to read might not be complete in the cache, so + # read another line first + pos = self.__cache.tell() + self.__cache.seek(0, 2) + self.__cache.write(self.wrapped.readline()) + self.__cache.seek(pos) + + data = self.__cache.readline() + if size != -1: + r = data[:size] + self.__cache.seek(pos+size) + else: + r = data + return r + + def readlines(self, sizehint=-1): + pos = self.__cache.tell() + self.__cache.seek(0, 2) + self.__cache.write(self.wrapped.read()) + self.__cache.seek(pos) + try: + return self.__cache.readlines(sizehint) + except TypeError: # 1.5.2 hack + return self.__cache.readlines() + + def __iter__(self): return self + def next(self): + line = self.readline() + if line == "": raise StopIteration + return line + + xreadlines = __iter__ + + def __repr__(self): + return ("<%s at %s whose wrapped object = %s>" % + (self.__class__.__name__, `id(self)`, `self.wrapped`)) + + def close(self): + self.read = None + self.readline = None + self.readlines = None + self.seek = None + if self.wrapped: self.wrapped.close() + self.wrapped = None diff --git a/tools/bug_tool/ClientCookie/__init__.py b/tools/bug_tool/ClientCookie/__init__.py new file mode 100644 index 0000000000..a5d9c95f4b --- /dev/null +++ b/tools/bug_tool/ClientCookie/__init__.py @@ -0,0 +1,49 @@ +# Import names so that they can be imported directly from the package, like +# this: +#from ClientCookie import <whatever> + +try: True +except NameError: + 
True = 1
+    False = 0
+
+import sys
+
+# Module-level debug switches, read at request time by the support code.
+# don't edit these here: do eg.
+# import ClientCookie; ClientCookie.HTTP_DEBUG = 1
+DEBUG_STREAM = sys.stderr
+CLIENTCOOKIE_DEBUG = False
+REDIRECT_DEBUG = False
+HTTP_DEBUG = False
+
+# Re-export the public API from the implementation modules.
+from _ClientCookie import VERSION, __doc__, \
+     CookieJar, Cookie, \
+     CookiePolicy, DefaultCookiePolicy, \
+     lwp_cookie_str
+from _MozillaCookieJar import MozillaCookieJar
+from _MSIECookieJar import MSIECookieJar
+# The urllib2 integration is optional: skip it on Pythons whose urllib2
+# lacks AbstractHTTPHandler.
+try:
+    from urllib2 import AbstractHTTPHandler
+except ImportError:
+    pass
+else:
+    from ClientCookie._urllib2_support import \
+         HTTPHandler, build_opener, install_opener, urlopen, \
+         HTTPRedirectHandler
+    from ClientCookie._urllib2_support import \
+         OpenerDirector, BaseProcessor, \
+         HTTPRequestUpgradeProcessor, \
+         HTTPEquivProcessor, SeekableProcessor, HTTPCookieProcessor, \
+         HTTPRefererProcessor, HTTPStandardHeadersProcessor, \
+         HTTPRefreshProcessor, HTTPErrorProcessor, \
+         HTTPResponseDebugProcessor
+
+    # HTTPS support is itself optional in httplib.
+    import httplib
+    if hasattr(httplib, 'HTTPS'):
+        from ClientCookie._urllib2_support import HTTPSHandler
+    del AbstractHTTPHandler, httplib
+from _Util import http2time
+str2time = http2time  # backwards-compatible alias
+del http2time
+
+del sys
diff --git a/tools/bug_tool/ClientCookie/_urllib2_support.py b/tools/bug_tool/ClientCookie/_urllib2_support.py
new file mode 100644
index 0000000000..d767d08b25
--- /dev/null
+++ b/tools/bug_tool/ClientCookie/_urllib2_support.py
@@ -0,0 +1,713 @@
+"""Integration with Python standard library module urllib2.
+
+Also includes a redirection bugfix, support for parsing HTML HEAD blocks for
+the META HTTP-EQUIV tag contents, and following Refresh header redirects.
+
+Copyright 2002-2003 John J Lee <jjl@pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD License (see the file COPYING included with the
+distribution). 
+
+"""
+
+import copy, time
+
+import ClientCookie
+from _ClientCookie import CookieJar, request_host
+from _Util import isstringlike
+from _Debug import _debug
+
+# Pre-2.2.1 compatibility: define True/False where the builtins are missing.
+try: True
+except NameError:
+    True = 1
+    False = 0
+
+CHUNK = 1024  # size of chunks fed to HTML HEAD parser, in bytes
+
+# Everything below is only defined when urllib2 is modern enough.
+try:
+    from urllib2 import AbstractHTTPHandler
+except ImportError:
+    pass
+else:
+    import urlparse, urllib2, urllib, httplib, htmllib, formatter, string
+    from urllib2 import URLError, HTTPError
+    import types, string, socket
+    from cStringIO import StringIO
+    from _Util import seek_wrapper
+    try:
+        import threading
+        _threading = threading; del threading
+    except ImportError:
+        import dummy_threading
+        _threading = dummy_threading; del dummy_threading
+
+    # This fixes a bug in urllib2 as of Python 2.1.3 and 2.2.2
+    # (http://www.python.org/sf/549151)
+    # 2.2.3 is broken here (my fault!), 2.3 is fixed.
+    class HTTPRedirectHandler(urllib2.BaseHandler):
+        # maximum number of redirections before assuming we're in a loop
+        max_redirections = 10
+
+        # Implementation notes:
+
+        # To avoid the server sending us into an infinite loop, the request
+        # object needs to track what URLs we have already seen. Do this by
+        # adding a handler-specific attribute to the Request object. The value
+        # of the dict is used to count the number of times the same url has
+        # been visited. This is needed because this isn't necessarily a loop:
+        # there is more than one way to redirect (Refresh, 302, 303, 307).
+
+        # Another handler-specific Request attribute, original_url, is used to
+        # remember the URL of the original request so that it is possible to
+        # decide whether or not RFC 2965 cookies should be turned on during
+        # redirect.
+
+        # Always unhandled redirection codes:
+        # 300 Multiple Choices: should not handle this here.
+        # 304 Not Modified: no need to handle here: only of interest to caches
+        #     that do conditional GETs
+        # 305 Use Proxy: probably not worth dealing with here
+        # 306 Unused: what was this for in the previous versions of protocol??
+
+        def redirect_request(self, newurl, req, fp, code, msg, headers):
+            """Return a Request or None in response to a redirect.
+
+            This is called by the http_error_30x methods when a redirection
+            response is received. If a redirection should take place, return a
+            new Request to allow http_error_30x to perform the redirect;
+            otherwise, return None to indicate that an HTTPError should be
+            raised.
+
+            """
+            if code in (301, 302, 303) or (code == 307 and not req.has_data()):
+                # Strictly (according to RFC 2616), 301 or 302 in response to
+                # a POST MUST NOT cause a redirection without confirmation
+                # from the user (of urllib2, in this case). In practice,
+                # essentially all clients do redirect in this case, so we do
+                # the same.
+                return Request(newurl, headers=req.headers)
+            else:
+                raise HTTPError(req.get_full_url(), code, msg, headers, fp)
+
+        def http_error_302(self, req, fp, code, msg, headers):
+            # Redirect target comes from Location (or the older URI header).
+            if headers.has_key('location'):
+                newurl = headers['location']
+            elif headers.has_key('uri'):
+                newurl = headers['uri']
+            else:
+                return
+            newurl = urlparse.urljoin(req.get_full_url(), newurl)
+
+            # XXX Probably want to forget about the state of the current
+            # request, although that might interact poorly with other
+            # handlers that also use handler-specific request attributes
+            new = self.redirect_request(newurl, req, fp, code, msg, headers)
+            if new is None:
+                return
+
+            # remember where we started from
+            if hasattr(req, "original_url"):
+                new.original_url = req.original_url
+            else:
+                new.original_url = req.get_full_url()
+
+            # loop detection
+            # .error_302_dict[(url, code)] is number of times url
+            # previously visited as a result of a redirection with this
+            # code (error_30x_dict would be a better name).
+            # NOTE(review): origin_req_host is only set on the request by
+            # HTTPCookieProcessor.http_request -- confirm that processor is
+            # always active when this handler runs, else AttributeError here.
+            new.origin_req_host = req.origin_req_host
+            if not hasattr(req, 'error_302_dict'):
+                new.error_302_dict = req.error_302_dict = {(newurl, code): 1}
+            else:
+                ed = new.error_302_dict = req.error_302_dict
+                nr_visits = ed.get((newurl, code), 0)
+                # Refreshes generate fake 302s, so we can hit the same URL as
+                # a result of the same redirection code twice without
+                # necessarily being in a loop! So, allow two visits to each
+                # URL as a result of each redirection code.
+                if len(ed) < self.max_redirections and nr_visits < 2:
+                    ed[(newurl, code)] = nr_visits + 1
+                else:
+                    raise HTTPError(req.get_full_url(), code,
+                                    self.inf_msg + msg, headers, fp)
+
+            if ClientCookie.REDIRECT_DEBUG:
+                _debug("redirecting to %s", newurl)
+
+            # Don't close the fp until we are sure that we won't use it
+            # with HTTPError.
+            fp.read()
+            fp.close()
+
+            return self.parent.open(new)
+
+        http_error_301 = http_error_303 = http_error_307 = http_error_302
+
+        inf_msg = "The HTTP server returned a redirect error that would " \
+                  "lead to an infinite loop.\n" \
+                  "The last 30x error message was:\n"
+
+
+    class Request(urllib2.Request):
+        # urllib2.Request extended with headers that are NOT copied onto
+        # redirected requests (e.g. Content-length, Cookie).
+        def __init__(self, url, data=None, headers={}):
+            urllib2.Request.__init__(self, url, data, headers)
+            self.unredirected_hdrs = {}
+
+        def add_unredirected_header(self, key, val):
+            # these headers do not persist from one request to the next in a chain
+            # of requests
+            self.unredirected_hdrs[string.capitalize(key)] = val
+
+        def has_key(self, header_name):
+            # True if the header is present in either header store.
+            if (self.headers.has_key(header_name) or
+                self.unredirected_hdrs.has_key(header_name)):
+                return True
+            return False
+
+        def get(self, header_name, failobj=None):
+            # NOTE(review): 'self.unredirected_headers' is never assigned --
+            # __init__ sets 'unredirected_hdrs' -- so the second lookup
+            # raises AttributeError whenever it is reached.  Likely a typo
+            # for 'unredirected_hdrs'; fix upstream.
+            if self.headers.has_key(header_name):
+                return self.headers[header_name]
+            if self.unredirected_headers.has_key(header_name):
+                return self.unredirected_headers[header_name]
+            return failobj
+
+
+    class BaseProcessor:
+        # Base class for request/response processors; ordering between
+        # processors is controlled by processor_order (lower runs earlier).
+        processor_order = 500
+
+        def add_parent(self, parent):
+            self.parent = parent
+        def close(self):
+            self.parent = None
+        def 
__lt__(self, other):
+            # Sort order used when processor lists are sorted: anything
+            # without a processor_order attribute sorts after us.
+            if not hasattr(other, "processor_order"):
+                return True
+            return self.processor_order < other.processor_order
+
+    class HTTPRequestUpgradeProcessor(BaseProcessor):
+        # upgrade Request to class with support for headers that don't get
+        # redirected
+        processor_order = 0  # before anything else
+
+        def http_request(self, request):
+            # Re-wrap plain urllib2.Request objects in this module's Request
+            # (detected by the absence of add_unredirected_header).
+            if not hasattr(request, "add_unredirected_header"):
+                request = Request(request._Request__original, request.data,
+                                  request.headers)
+            return request
+
+        https_request = http_request
+
+    class HTTPEquivProcessor(BaseProcessor):
+        """Append META HTTP-EQUIV headers to regular HTTP headers."""
+        def http_response(self, request, response):
+            # Needs a seekable response: parse_head consumes the body, then
+            # we rewind so downstream consumers see the full response.
+            if not hasattr(response, "seek"):
+                response = seek_wrapper(response)
+            # grab HTTP-EQUIV headers and add them to the true HTTP headers
+            headers = response.info()
+            for hdr, val in parse_head(response):
+                headers[hdr] = val
+            response.seek(0)
+            return response
+
+        https_response = http_response
+
+    # XXX ATM this only takes notice of http responses -- probably
+    # should be independent of protocol scheme (http, ftp, etc.)
+    class SeekableProcessor(BaseProcessor):
+        """Make responses seekable."""
+
+        def http_response(self, request, response):
+            if not hasattr(response, "seek"):
+                return seek_wrapper(response)
+            return response
+
+        https_response = http_response
+
+    # XXX if this gets added to urllib2, unverifiable would end up as an
+    # attribute on Request.
+    class HTTPCookieProcessor(BaseProcessor):
+        """Handle HTTP cookies."""
+        def __init__(self, cookies=None):
+            if cookies is None:
+                cookies = CookieJar()
+            self.cookies = cookies
+
+        def _unverifiable(self, request):
+            # A request is "unverifiable" (RFC 2965 sense) if it resulted
+            # from a redirect (error_302_dict set by HTTPRedirectHandler) or
+            # was explicitly flagged as such by the caller.
+            if hasattr(request, "error_302_dict") and request.error_302_dict:
+                redirect = True
+            else:
+                redirect = False
+            if (redirect or
+                (hasattr(request, "unverifiable") and request.unverifiable)):
+                unverifiable = True
+            else:
+                unverifiable = False
+            return unverifiable
+
+        def http_request(self, request):
+            unverifiable = self._unverifiable(request)
+            if not unverifiable:
+                # Stuff request-host of this origin transaction into Request
+                # object, because we need to know it to know whether cookies
+                # should be in operation during derived requests (redirects,
+                # specifically -- including refreshes).
+                request.origin_req_host = request_host(request)
+            self.cookies.add_cookie_header(request, unverifiable)
+            return request
+
+        def http_response(self, request, response):
+            unverifiable = self._unverifiable(request)
+            self.cookies.extract_cookies(response, request, unverifiable)
+            return response
+
+        https_request = http_request
+        https_response = http_response
+
+    class HTTPRefererProcessor(BaseProcessor):
+        """Add Referer header to requests.
+
+        This only makes sense if you use each RefererProcessor for a single
+        chain of requests only (so, for example, if you use a single
+        HTTPRefererProcessor to fetch a series of URLs extracted from a single
+        page, this will break).
+
+        """
+        def __init__(self):
+            self.referer = None
+
+        def http_request(self, request):
+            # Referer is unredirected: it must not leak onto derived requests.
+            if ((self.referer is not None) and
+                not request.has_key("Referer")):
+                request.add_unredirected_header("Referer", self.referer)
+            return request
+
+        def http_response(self, request, response):
+            # Remember this URL as the Referer for the NEXT request.
+            self.referer = response.geturl()
+            return response
+
+        https_request = http_request
+        https_response = http_response
+
+    class HTTPStandardHeadersProcessor(BaseProcessor):
+        # Adds the standard Host / Content-type / Content-length headers
+        # (work that urllib2's AbstractHTTPHandler normally does).
+        def http_request(self, request):
+            host = request.get_host()
+            if not host:
+                raise URLError('no host given')
+
+            if request.has_data():  # POST
+                data = request.get_data()
+                if not request.has_key('Content-type'):
+                    request.add_unredirected_header(
+                        'Content-type',
+                        'application/x-www-form-urlencoded')
+                if not request.has_key('Content-length'):
+                    request.add_unredirected_header(
+                        'Content-length', '%d' % len(data))
+
+            scheme, sel = urllib.splittype(request.get_selector())
+            sel_host, sel_path = urllib.splithost(sel)
+            if not request.has_key('Host'):
+                request.add_unredirected_header('Host', sel_host or host)
+            for name, value in self.parent.addheaders:
+                name = string.capitalize(name)
+                if not request.has_key(name):
+                    request.add_unredirected_header(name, value)
+
+            return request
+
+        https_request = http_request
+
+    class HTTPResponseDebugProcessor(BaseProcessor):
+        processor_order = 900  # before redirections, after everything else
+
+        def http_response(self, request, response):
+            # Dump the whole body to the debug stream, then rewind so the
+            # response is still usable downstream.
+            if not hasattr(response, "seek"):
+                response = seek_wrapper(response)
+            _debug(response.read())
+            _debug("*****************************************************")
+            response.seek(0)
+            return response
+
+        https_response = http_response
+
+    class HTTPRefreshProcessor(BaseProcessor):
+        """Perform HTTP Refresh redirections.
+
+        Note that if a non-200 HTTP code has occurred (for example, a 30x
+        redirect), this processor will do nothing.
+
+        By default, only zero-time Refresh headers are redirected. Use the
+        max_time constructor argument to allow Refresh with longer pauses.
+        Use the honor_time argument to control whether the requested pause
+        is honoured (with a time.sleep()) or skipped in favour of immediate
+        redirection.
+
+        """
+        processor_order = 1000
+
+        def __init__(self, max_time=0, honor_time=True):
+            self.max_time = max_time
+            self.honor_time = honor_time
+
+        def http_response(self, request, response):
+            code, msg, hdrs = response.code, response.msg, response.info()
+
+            # Refresh header format: "<pause>; url=<newurl>"
+            if code == 200 and hdrs.has_key("refresh"):
+                refresh = hdrs["refresh"]
+                i = string.find(refresh, ";")
+                if i != -1:
+                    pause, newurl_spec = refresh[:i], refresh[i+1:]
+                    i = string.find(newurl_spec, "=")
+                    if i != -1:
+                        pause = int(pause)
+                        if pause <= self.max_time:
+                            if pause != 0 and self.honor_time:
+                                time.sleep(pause)
+                            newurl = newurl_spec[i+1:]
+                            # fake a 302 response
+                            hdrs["location"] = newurl
+                            response = self.parent.error(
+                                'http', request, response, 302, msg, hdrs)
+
+            return response
+
+        https_response = http_response
+
+    class HTTPErrorProcessor(BaseProcessor):
+        """Process non-200 HTTP error responses.
+
+        This just passes the job on to the Handler.<proto>_error_<code>
+        methods, via the OpenerDirector.error method.
+
+        """
+        processor_order = 1000
+
+        def http_response(self, request, response):
+            code, msg, hdrs = response.code, response.msg, response.info()
+
+            if code != 200:
+                response = self.parent.error(
+                    'http', request, response, code, msg, hdrs)
+
+            return response
+
+        https_response = http_response
+
+
+    class OpenerDirector(urllib2.OpenerDirector):
+        # urllib2.OpenerDirector extended with request/response processor
+        # chains, keyed by protocol scheme.
+        # XXX might be useful to have remove_processor, too (say you want to
+        # set a new RefererProcessor, but keep the old CookieProcessor --
+        # could always just create everything anew, though (using old
+        # CookieJar object to create CookieProcessor)
+        def __init__(self):
+            urllib2.OpenerDirector.__init__(self)
+            #self.processors = []
+            self.process_response = {}
+            self.process_request = {}
+
+        def add_handler(self, handler):
+            # Register one object, inspecting its method names to decide
+            # which protocol/error/processor tables it belongs in.
+            # XXX
+            # tidy me
+            # the same handler could be added twice without detection
+            added = 0
+            for meth in dir(handler.__class__):
+                if meth[-5:] == '_open':
+                    protocol = meth[:-5]
+                    if self.handle_open.has_key(protocol):
+                        self.handle_open[protocol].append(handler)
+                        self.handle_open[protocol].sort()
+                    else:
+                        self.handle_open[protocol] = [handler]
+                    added = 1
+                    continue
+                i = string.find(meth, '_')
+                j = string.find(meth[i+1:], '_') + i + 1
+                # NOTE(review): string.find returns -1 on failure, so after
+                # the "+ i + 1" adjustment j can never be -1 -- a missing
+                # second '_' yields j == i, which this test does not catch.
+                # Kept as-is; matches upstream ClientCookie of this era.
+                if j != -1 and meth[i+1:j] == 'error':
+                    proto = meth[:i]
+                    kind = meth[j+1:]
+                    try:
+                        kind = int(kind)
+                    except ValueError:
+                        pass
+                    # (local name 'dict' shadows the builtin)
+                    dict = self.handle_error.get(proto, {})
+                    if dict.has_key(kind):
+                        dict[kind].append(handler)
+                        dict[kind].sort()
+                    else:
+                        dict[kind] = [handler]
+                    self.handle_error[proto] = dict
+                    added = 1
+                    continue
+                if meth[-9:] == "_response":
+                    protocol = meth[:-9]
+                    if self.process_response.has_key(protocol):
+                        self.process_response[protocol].append(handler)
+                        self.process_response[protocol].sort()
+                    else:
+                        self.process_response[protocol] = [handler]
+                    added = True
+                    continue
+                elif meth[-8:] == "_request":
+                    protocol = meth[:-8]
+                    if self.process_request.has_key(protocol):
+                        self.process_request[protocol].append(handler)
+                        self.process_request[protocol].sort()
+                    else:
+                        self.process_request[protocol] = [handler]
+                    added = True
+                    continue
+            if added:
+                self.handlers.append(handler)
+                self.handlers.sort()
+                handler.add_parent(self)
+
+##         def add_processor(self, processor):
+##             added = False
+##             for meth in dir(processor):
+##                 if meth[-9:] == "_response":
+##                     protocol = meth[:-9]
+##                     if self.process_response.has_key(protocol):
+##                         self.process_response[protocol].append(processor)
+##                         self.process_response[protocol].sort()
+##                     else:
+##                         self.process_response[protocol] = [processor]
+##                     added = True
+##                     continue
+##                 elif meth[-8:] == "_request":
+##                     protocol = meth[:-8]
+##                     if self.process_request.has_key(protocol):
+##                         self.process_request[protocol].append(processor)
+##                         self.process_request[protocol].sort()
+##                     else:
+##                         self.process_request[protocol] = [processor]
+##                     added = True
+##                     continue
+##             if added:
+##                 self.processors.append(processor)
+##                 # XXX base class sorts .handlers, but I have no idea why
+##                 #self.processors.sort()
+##                 processor.add_parent(self)
+
+        def _request(self, url_or_req, data):
+            # Normalise a URL string or Request object into a Request.
+            if isstringlike(url_or_req):
+                req = Request(url_or_req, data)
+            else:
+                # already a urllib2.Request instance
+                req = url_or_req
+                if data is not None:
+                    req.add_data(data)
+            return req
+
+        def open(self, fullurl, data=None):
+            req = self._request(fullurl, data)
+            # (local name 'type' shadows the builtin)
+            type = req.get_type()
+
+            # pre-process request
+            # XXX should we allow a Processor to change the type (URL
+            #   scheme) of the request?
+            meth_name = type+"_request"
+            for processor in self.process_request.get(type, []):
+                meth = getattr(processor, meth_name)
+                req = meth(req)
+
+            response = urllib2.OpenerDirector.open(self, req, data)
+
+            # post-process response
+            meth_name = type+"_response"
+            for processor in self.process_response.get(type, []):
+                meth = getattr(processor, meth_name)
+                response = meth(req, response)
+
+            return response
+
+##         def close(self):
+##             urllib2.OpenerDirector.close(self)
+##             for processor in self.processors:
+##                 processor.close()
+##             self.processors = []
+
+
+    # Note the absence of redirect and header-adding code here
+    # (AbstractHTTPHandler), and the lack of other clutter that would be
+    # here without Processors.
+    class AbstractHTTPHandler(urllib2.BaseHandler):
+        def do_open(self, http_class, req):
+            host = req.get_host()
+            if not host:
+                raise URLError('no host given')
+
+            h = http_class(host)  # will parse host:port
+            if ClientCookie.HTTP_DEBUG:
+                h.set_debuglevel(1)
+
+            if req.has_data():
+                h.putrequest('POST', req.get_selector())
+            else:
+                h.putrequest('GET', req.get_selector())
+
+            # Send both normal and unredirected headers.
+            for k, v in req.headers.items():
+                h.putheader(k, v)
+            for k, v in req.unredirected_hdrs.items():
+                h.putheader(k, v)
+
+            # httplib will attempt to connect() here. be prepared
+            # to convert a socket error to a URLError.
+            try:
+                h.endheaders()
+            except socket.error, err:
+                raise URLError(err)
+            if req.has_data():
+                h.send(req.get_data())
+
+            code, msg, hdrs = h.getreply()
+            fp = h.getfile()
+
+            response = urllib.addinfourl(fp, hdrs, req.get_full_url())
+            response.code = code
+            response.msg = msg
+
+            return response
+
+
+    # XXX would self.reset() work, instead of raising this exception?
+ class EndOfHeadError(Exception): pass + class HeadParser(htmllib.HTMLParser): + # only these elements are allowed in or before HEAD of document + head_elems = ("html", "head", + "title", "base", + "script", "style", "meta", "link", "object") + def __init__(self): + htmllib.HTMLParser.__init__(self, formatter.NullFormatter()) + self.http_equiv = [] + + def start_meta(self, attrs): + http_equiv = content = None + for key, value in attrs: + if key == "http-equiv": + http_equiv = value + elif key == "content": + content = value + if http_equiv is not None: + self.http_equiv.append((http_equiv, content)) + + def handle_starttag(self, tag, method, attrs): + if tag in self.head_elems: + method(attrs) + else: + raise EndOfHeadError() + + def handle_endtag(self, tag, method): + if tag in self.head_elems: + method() + else: + raise EndOfHeadError() + + def end_head(self): + raise EndOfHeadError() + + def parse_head(file): + """Return a list of key, value pairs.""" + hp = HeadParser() + while 1: + data = file.read(CHUNK) + try: + hp.feed(data) + except EndOfHeadError: + break + if len(data) != CHUNK: + # this should only happen if there is no HTML body, or if + # CHUNK is big + break + return hp.http_equiv + + + class HTTPHandler(AbstractHTTPHandler): + def http_open(self, req): + return self.do_open(httplib.HTTP, req) + + if hasattr(httplib, 'HTTPS'): + class HTTPSHandler(AbstractHTTPHandler): + def https_open(self, req): + return self.do_open(httplib.HTTPS, req) + + + def build_opener(*handlers): + """Create an opener object from a list of handlers and processors. + + The opener will use several default handlers and processors, including + support for HTTP and FTP. If there is a ProxyHandler, it must be at the + front of the list of handlers. (Yuck. This is fixed in 2.3.) + + If any of the handlers passed as arguments are subclasses of the + default handlers, the default handlers will not be used. 
+ """ + opener = OpenerDirector() + default_classes = [ + # handlers + urllib2.ProxyHandler, + urllib2.UnknownHandler, + HTTPHandler, # from this module (derived from new AbstractHTTPHandler) + urllib2.HTTPDefaultErrorHandler, + HTTPRedirectHandler, # from this module (bugfixed) + urllib2.FTPHandler, + urllib2.FileHandler, + # processors + HTTPRequestUpgradeProcessor, + #HTTPEquivProcessor, + #SeekableProcessor, + HTTPCookieProcessor, + #HTTPRefererProcessor, + HTTPStandardHeadersProcessor, + #HTTPRefreshProcessor, + HTTPErrorProcessor + ] + if hasattr(httplib, 'HTTPS'): + default_classes.append(HTTPSHandler) + skip = [] + for klass in default_classes: + for check in handlers: + if type(check) == types.ClassType: + if issubclass(check, klass): + skip.append(klass) + elif type(check) == types.InstanceType: + if isinstance(check, klass): + skip.append(klass) + for klass in skip: + default_classes.remove(klass) + + to_add = [] + for klass in default_classes: + to_add.append(klass()) + for h in handlers: + if type(h) == types.ClassType: + h = h() + to_add.append(h) + + for instance in to_add: + opener.add_handler(instance) +## # yuck +## if hasattr(instance, "processor_order"): +## opener.add_processor(instance) +## else: +## opener.add_handler(instance) + + return opener + + + _opener = None + urlopen_lock = _threading.Lock() + def urlopen(url, data=None): + global _opener + if _opener is None: + urlopen_lock.acquire() + try: + if _opener is None: + _opener = build_opener() + finally: + urlopen_lock.release() + return _opener.open(url, data) + + def install_opener(opener): + global _opener + _opener = opener |