diff options
author | Taybin Rutkin <taybin@taybin.com> | 2005-05-13 20:47:18 +0000 |
---|---|---|
committer | Taybin Rutkin <taybin@taybin.com> | 2005-05-13 20:47:18 +0000 |
commit | d09f6b3016bacbc2871a8946cbb24ad705076509 (patch) | |
tree | f27312839c2a772cb2ce068a4f28b2449ad869df /tools/bug_tool/ClientForm.py |
Initial revision
git-svn-id: svn://localhost/trunk/ardour2@4 d708f5d6-7413-0410-9779-e7cbd77b26cf
Diffstat (limited to 'tools/bug_tool/ClientForm.py')
-rw-r--r-- | tools/bug_tool/ClientForm.py | 2699 |
1 files changed, 2699 insertions, 0 deletions
diff --git a/tools/bug_tool/ClientForm.py b/tools/bug_tool/ClientForm.py new file mode 100644 index 0000000000..c42f65b313 --- /dev/null +++ b/tools/bug_tool/ClientForm.py @@ -0,0 +1,2699 @@ +"""HTML form handling for web clients. + +ClientForm is a Python module for handling HTML forms on the client +side, useful for parsing HTML forms, filling them in and returning the +completed forms to the server. It has developed from a port of Gisle +Aas' Perl module HTML::Form, from the libwww-perl library, but the +interface is not the same. + +The most useful docstring is the one for HTMLForm. + +RFC 1866: HTML 2.0 +RFC 1867: Form-based File Upload in HTML +RFC 2388: Returning Values from Forms: multipart/form-data +HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX) +HTML 4.01 Specification, W3C Recommendation 24 December 1999 + + +Copyright 2002-2003 John J. Lee <jjl@pobox.com> +Copyright 1998-2000 Gisle Aas. + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file COPYING included with +the distribution). + +""" + +# XXX +# Treat unknown controls as text controls? (this was a recent LWP +# HTML::Form change) I guess this is INPUT with no TYPE? Check LWP +# source and browser behaviour. +# Support for list item ids. How to handle missing ids? (How do I deal +# with duplicate OPTION labels ATM? Can't remember...) +# Arrange things so can automatically PyPI-register with categories +# without messing up 1.5.2 compatibility. +# Tests need work. +# Test single and multiple file upload some more on the web. +# Does file upload work when name is missing? Sourceforge tracker form +# doesn't like it. Check standards, and test with Apache. Test binary +# upload with Apache. +# Add label support for CHECKBOX and RADIO. +# Better docs. +# Deal with character sets properly. Not sure what the issues are here. 
+# I don't *think* any encoding of control names, filenames or data is +# necessary -- HTML spec. doesn't require it, and Mozilla Firebird 0.6 +# doesn't seem to do it. +# Add charset parameter to Content-type headers? How to find value?? +# Get rid of MapBase, AList and MimeWriter. +# I'm not going to fix this unless somebody tells me what real servers +# that want this encoding actually expect: If enctype is +# application/x-www-form-urlencoded and there's a FILE control present. +# Strictly, it should be 'name=data' (see HTML 4.01 spec., section +# 17.13.2), but I send "name=" ATM. What about multiple file upload?? +# Get rid of the two type-switches (for kind and click*). +# Remove single-selection code: can be special case of multi-selection, +# with a few variations, I think. +# Factor out multiple-selection list code? May not be easy. Maybe like +# this: + +# ListControl +# ^ +# | MultipleListControlMixin +# | ^ +# SelectControl / +# ^ / +# \ / +# MultiSelectControl + + +# Plan +# ---- +# Maybe a 0.2.x, cleaned up a bit and with id support for list items? +# Not sure it's worth it, really. +# Remove toggle methods. +# Replace by_label with choice between value / id / label / +# element contents (see discussion with Gisle about labels on +# libwww-perl list). +# ...what else? +# Work on DOMForm. +# XForms? Don't know if there's a need here. + + +try: True +except NameError: + True = 1 + False = 0 + +try: bool +except NameError: + def bool(expr): + if expr: return True + else: return False + +import sys, urllib, urllib2, types, string, mimetools, copy +from urlparse import urljoin +from cStringIO import StringIO +try: + import UnicodeType +except ImportError: + UNICODE = False +else: + UNICODE = True + +VERSION = "0.1.13" + +CHUNK = 1024 # size of chunks fed to parser, in bytes + +# This version of urlencode is from my Python 1.5.2 back-port of the +# Python 2.1 CVS maintenance branch of urllib. 
It will accept a sequence +# of pairs instead of a mapping -- the 2.0 version only accepts a mapping. +def urlencode(query,doseq=False,): + """Encode a sequence of two-element tuples or dictionary into a URL query \ +string. + + If any values in the query arg are sequences and doseq is true, each + sequence element is converted to a separate parameter. + + If the query arg is a sequence of two-element tuples, the order of the + parameters in the output will match the order of parameters in the + input. + """ + + if hasattr(query,"items"): + # mapping objects + query = query.items() + else: + # it's a bother at times that strings and string-like objects are + # sequences... + try: + # non-sequence items should not work with len() + x = len(query) + # non-empty strings will fail this + if len(query) and type(query[0]) != types.TupleType: + raise TypeError() + # zero-length sequences of all types will get here and succeed, + # but that's a minor nit - since the original implementation + # allowed empty dicts that type of behavior probably should be + # preserved for consistency + except TypeError: + ty,va,tb = sys.exc_info() + raise TypeError("not a valid non-string sequence or mapping " + "object", tb) + + l = [] + if not doseq: + # preserve old behavior + for k, v in query: + k = urllib.quote_plus(str(k)) + v = urllib.quote_plus(str(v)) + l.append(k + '=' + v) + else: + for k, v in query: + k = urllib.quote_plus(str(k)) + if type(v) == types.StringType: + v = urllib.quote_plus(v) + l.append(k + '=' + v) + elif UNICODE and type(v) == types.UnicodeType: + # is there a reasonable way to convert to ASCII? + # encode generates a string, but "replace" or "ignore" + # lose information and "strict" can raise UnicodeError + v = urllib.quote_plus(v.encode("ASCII","replace")) + l.append(k + '=' + v) + else: + try: + # is this a sufficient test for sequence-ness? 
+ x = len(v) + except TypeError: + # not a sequence + v = urllib.quote_plus(str(v)) + l.append(k + '=' + v) + else: + # loop over the sequence + for elt in v: + l.append(k + '=' + urllib.quote_plus(str(elt))) + return string.join(l, '&') + +def startswith(string, initial): + if len(initial) > len(string): return False + return string[:len(initial)] == initial + +def issequence(x): + try: + x[0] + except (TypeError, KeyError): + return False + except IndexError: + pass + return True + +def isstringlike(x): + try: x+"" + except: return False + else: return True + + +# XXX don't really want to drag this along (MapBase, AList, MimeWriter) + +class MapBase: + """Mapping designed to be easily derived from. + + Subclass it and override __init__, __setitem__, __getitem__, __delitem__ + and keys. Nothing else should need to be overridden, unlike UserDict. + This significantly simplifies dictionary-like classes. + + Also different from UserDict in that it has a redonly flag, and can be + updated (and initialised) with a sequence of pairs (key, value). 
+ + """ + def __init__(self, init=None): + self._data = {} + self.readonly = False + if init is not None: self.update(init) + + def __getitem__(self, key): + return self._data[key] + + def __setitem__(self, key, item): + if not self.readonly: + self._data[key] = item + else: + raise TypeError("object doesn't support item assignment") + + def __delitem__(self, key): + if not self.readonly: + del self._data[key] + else: + raise TypeError("object doesn't support item deletion") + + def keys(self): + return self._data.keys() + + # now the internal workings, there should be no need to override these: + + def clear(self): + for k in self.keys(): + del self[k] + + def __repr__(self): + rep = [] + for k, v in self.items(): + rep.append("%s: %s" % (repr(k), repr(v))) + return self.__class__.__name__+"{"+(string.join(rep, ", "))+"}" + + def copy(self): + return copy.copy(self) + + def __cmp__(self, dict): + # note: return value is *not* boolean + for k, v in self.items(): + if not (dict.has_key(k) and dict[k] == v): + return 1 # different + return 0 # the same + + def __len__(self): + return len(self.keys()) + + def values(self): + r = [] + for k in self.keys(): + r.append(self[k]) + return r + + def items(self): + keys = self.keys() + vals = self.values() + r = [] + for i in len(self): + r.append((keys[i], vals[i])) + return r + + def has_key(self, key): + return key in self.keys() + + def update(self, map): + if issequence(map) and not isstringlike(map): + items = map + else: + items = map.items() + for tup in items: + if not isinstance(tup, TupleType): + raise TypeError( + "MapBase.update requires a map or a sequence of pairs") + k, v = tup + self[k] = v + + def get(self, key, failobj=None): + if key in self.keys(): + return self[key] + else: + return failobj + + def setdefault(self, key, failobj=None): + if not self.has_key(key): + self[key] = failobj + return self[key] + + +class AList(MapBase): + """Read-only ordered mapping.""" + def __init__(self, seq=[]): + 
self.readonly = True + self._inverted = False + self._data = list(seq[:]) + self._keys = [] + self._values = [] + for key, value in seq: + self._keys.append(key) + self._values.append(value) + + def set_inverted(self, inverted): + if (inverted and not self._inverted) or ( + not inverted and self._inverted): + self._keys, self._values = self._values, self._keys + if inverted: self._inverted = True + else: self._inverted = False + + def __getitem__(self, key): + try: + i = self._keys.index(key) + except ValueError: + raise KeyError(key) + return self._values[i] + + def __delitem__(self, key): + try: + i = self._keys.index[key] + except ValueError: + raise KeyError(key) + del self._values[i] + + def keys(self): return list(self._keys[:]) + def values(self): return list(self._values[:]) + def items(self): + data = self._data[:] + if not self._inverted: + return data + else: + newdata = [] + for k, v in data: + newdata.append((v, k)) + return newdata + + +# This cut-n-pasted MimeWriter from standard library is here so can add +# to HTTP headers rather than message body when appropriate. It also uses +# \r\n in place of \n. This is nasty. +class MimeWriter: + + """Generic MIME writer. + + Methods: + + __init__() + addheader() + flushheaders() + startbody() + startmultipartbody() + nextpart() + lastpart() + + A MIME writer is much more primitive than a MIME parser. It + doesn't seek around on the output file, and it doesn't use large + amounts of buffer space, so you have to write the parts in the + order they should occur on the output file. It does buffer the + headers you add, allowing you to rearrange their order. + + General usage is: + + f = <open the output file> + w = MimeWriter(f) + ...call w.addheader(key, value) 0 or more times... + + followed by either: + + f = w.startbody(content_type) + ...call f.write(data) for body data... 
+ + or: + + w.startmultipartbody(subtype) + for each part: + subwriter = w.nextpart() + ...use the subwriter's methods to create the subpart... + w.lastpart() + + The subwriter is another MimeWriter instance, and should be + treated in the same way as the toplevel MimeWriter. This way, + writing recursive body parts is easy. + + Warning: don't forget to call lastpart()! + + XXX There should be more state so calls made in the wrong order + are detected. + + Some special cases: + + - startbody() just returns the file passed to the constructor; + but don't use this knowledge, as it may be changed. + + - startmultipartbody() actually returns a file as well; + this can be used to write the initial 'if you can read this your + mailer is not MIME-aware' message. + + - If you call flushheaders(), the headers accumulated so far are + written out (and forgotten); this is useful if you don't need a + body part at all, e.g. for a subpart of type message/rfc822 + that's (mis)used to store some header-like information. + + - Passing a keyword argument 'prefix=<flag>' to addheader(), + start*body() affects where the header is inserted; 0 means + append at the end, 1 means insert at the start; default is + append for addheader(), but insert for start*body(), which use + it to determine where the Content-type header goes. + + """ + + def __init__(self, fp, http_hdrs=None): + self._http_hdrs = http_hdrs + self._fp = fp + self._headers = [] + self._boundary = [] + self._first_part = True + + def addheader(self, key, value, prefix=0, + add_to_http_hdrs=0): + """ + prefix is ignored if add_to_http_hdrs is true. 
+ """ + lines = string.split(value, "\r\n") + while lines and not lines[-1]: del lines[-1] + while lines and not lines[0]: del lines[0] + if add_to_http_hdrs: + value = string.join(lines, "") + self._http_hdrs.append((key, value)) + else: + for i in range(1, len(lines)): + lines[i] = " " + string.strip(lines[i]) + value = string.join(lines, "\r\n") + "\r\n" + line = key + ": " + value + if prefix: + self._headers.insert(0, line) + else: + self._headers.append(line) + + def flushheaders(self): + self._fp.writelines(self._headers) + self._headers = [] + + def startbody(self, ctype=None, plist=[], prefix=1, + add_to_http_hdrs=0, content_type=1): + """ + prefix is ignored if add_to_http_hdrs is true. + """ + if content_type and ctype: + for name, value in plist: + ctype = ctype + ';\r\n %s=\"%s\"' % (name, value) + self.addheader("Content-type", ctype, prefix=prefix, + add_to_http_hdrs=add_to_http_hdrs) + self.flushheaders() + if not add_to_http_hdrs: self._fp.write("\r\n") + self._first_part = True + return self._fp + + def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1, + add_to_http_hdrs=0, content_type=1): + boundary = boundary or mimetools.choose_boundary() + self._boundary.append(boundary) + return self.startbody("multipart/" + subtype, + [("boundary", boundary)] + plist, + prefix=prefix, + add_to_http_hdrs=add_to_http_hdrs, + content_type=content_type) + + def nextpart(self): + boundary = self._boundary[-1] + if self._first_part: + self._first_part = False + else: + self._fp.write("\r\n") + self._fp.write("--" + boundary + "\r\n") + return self.__class__(self._fp) + + def lastpart(self): + if self._first_part: + self.nextpart() + boundary = self._boundary.pop() + self._fp.write("\r\n--" + boundary + "--\r\n") + + +class ControlNotFoundError(ValueError): pass +class ItemNotFoundError(ValueError): pass +class ItemCountError(ValueError): pass + +class ParseError(Exception): pass + + +def ParseResponse(response, select_default=False, 
ignore_errors=False): + """Parse HTTP response and return a list of HTMLForm instances. + + The return value of urllib2.urlopen can be conveniently passed to this + function as the response parameter. + + ClientForm.ParseError is raised on parse errors. + + response: file-like object (supporting read() method) with a method + geturl(), returning the base URI of the HTTP response + select_default: for multiple-selection SELECT controls and RADIO controls, + pick the first item as the default if none are selected in the HTML + ignore_errors: don't raise ParseError, and carry on regardless if the + parser gets confused + + Pass a true value for select_default if you want the behaviour specified by + RFC 1866 (the HTML 2.0 standard), which is to select the first item in a + RADIO or multiple-selection SELECT control if none were selected in the + HTML. Most browsers (including Microsoft Internet Explorer (IE) and + Netscape Navigator) instead leave all items unselected in these cases. The + W3C HTML 4.0 standard leaves this behaviour undefined in the case of + multiple-selection SELECT controls, but insists that at least one RADIO + button should be checked at all times, in contradiction to browser + behaviour. + + Precisely what ignore_errors does isn't well-defined yet, so don't rely too + much on the current behaviour -- if you want robustness, you're better off + fixing the HTML before passing it to this function. + + """ + return ParseFile(response, response.geturl(), select_default) + +def ParseFile(file, base_uri, select_default=False, ignore_errors=False): + """Parse HTML and return a list of HTMLForm instances. + + ClientForm.ParseError is raised on parse errors. + + file: file-like object (supporting read() method) containing HTML with zero + or more forms to be parsed + base_uri: the base URI of the document + + For the other arguments and further details, see ParseResponse.__doc__. 
+ + """ + fp = _FORM_PARSER_CLASS(ignore_errors) + while 1: + data = file.read(CHUNK) + fp.feed(data) + if len(data) != CHUNK: break + forms = [] + for (name, action, method, enctype), attrs, controls in fp.forms: + if action is None: + action = base_uri + else: + action = urljoin(base_uri, action) + form = HTMLForm(action, method, enctype, name, attrs) + for type, name, attr in controls: + form.new_control(type, name, attr, select_default=select_default) + forms.append(form) + for form in forms: + form.fixup() + return forms + + +class _AbstractFormParser: + """forms attribute contains HTMLForm instances on completion.""" + # pinched (and modified) from Moshe Zadka + def __init__(self, ignore_errors, entitydefs=None): + if entitydefs is not None: + self.entitydefs = entitydefs + self._ignore_errors = ignore_errors + self.forms = [] + self._current_form = None + self._select = None + self._optgroup = None + self._option = None + self._textarea = None + + def error(self, error): + if not self._ignore_errors: raise error + + def start_form(self, attrs): + if self._current_form is not None: + self.error(ParseError("nested FORMs")) + name = None + action = None + enctype = "application/x-www-form-urlencoded" + method = "GET" + d = {} + for key, value in attrs: + if key == "name": + name = value + elif key == "action": + action = value + elif key == "method": + method = string.upper(value) + elif key == "enctype": + enctype = string.lower(value) + else: + d[key] = value + controls = [] + self._current_form = (name, action, method, enctype), d, controls + + def end_form(self): + if self._current_form is None: + self.error(ParseError("end of FORM before start")) + self.forms.append(self._current_form) + self._current_form = None + + def start_select(self, attrs): + if self._current_form is None: + self.error(ParseError("start of SELECT before start of FORM")) + if self._select is not None: + self.error(ParseError("nested SELECTs")) + if self._textarea is not None: + 
self.error(ParseError("SELECT inside TEXTAREA")) + d = {} + for key, val in attrs: + d[key] = val + + self._select = d + + self._append_select_control({"__select": d}) + + def end_select(self): + if self._current_form is None: + self.error(ParseError("end of SELECT before start of FORM")) + if self._select is None: + self.error(ParseError("end of SELECT before start")) + + if self._option is not None: + self._end_option() + + self._select = None + + def start_optgroup(self, attrs): + if self._select is None: + self.error(ParseError("OPTGROUP outside of SELECT")) + d = {} + for key, val in attrs: + d[key] = val + + self._optgroup = d + + def end_optgroup(self): + if self._optgroup is None: + self.error(ParseError("end of OPTGROUP before start")) + self._optgroup = None + + def _start_option(self, attrs): + if self._select is None: + self.error(ParseError("OPTION outside of SELECT")) + if self._option is not None: + self._end_option() + + d = {} + for key, val in attrs: + d[key] = val + + self._option = {} + self._option.update(d) + if (self._optgroup and self._optgroup.has_key("disabled") and + not self._option.has_key("disabled")): + self._option["disabled"] = None + + def _end_option(self): + if self._option is None: + self.error(ParseError("end of OPTION before start")) + + contents = string.strip(self._option.get("contents", "")) + #contents = string.strip(self._option["contents"]) + self._option["contents"] = contents + if not self._option.has_key("value"): + self._option["value"] = contents + if not self._option.has_key("label"): + self._option["label"] = contents + # stuff dict of SELECT HTML attrs into a special private key + # (gets deleted again later) + self._option["__select"] = self._select + self._append_select_control(self._option) + self._option = None + + def _append_select_control(self, attrs): + controls = self._current_form[2] + name = self._select.get("name") + controls.append(("select", name, attrs)) + +## def do_option(self, attrs): +## if 
self._select is None: +## self.error(ParseError("OPTION outside of SELECT")) +## d = {} +## for key, val in attrs: +## d[key] = val + +## self._option = {} +## self._option.update(d) +## if (self._optgroup and self._optgroup.has_key("disabled") and +## not self._option.has_key("disabled")): +## self._option["disabled"] = None + + def start_textarea(self, attrs): + if self._current_form is None: + self.error(ParseError("start of TEXTAREA before start of FORM")) + if self._textarea is not None: + self.error(ParseError("nested TEXTAREAs")) + if self._select is not None: + self.error(ParseError("TEXTAREA inside SELECT")) + d = {} + for key, val in attrs: + d[key] = val + + self._textarea = d + + def end_textarea(self): + if self._current_form is None: + self.error(ParseError("end of TEXTAREA before start of FORM")) + if self._textarea is None: + self.error(ParseError("end of TEXTAREA before start")) + controls = self._current_form[2] + name = self._textarea.get("name") + controls.append(("textarea", name, self._textarea)) + self._textarea = None + + def handle_data(self, data): + if self._option is not None: + # self._option is a dictionary of the OPTION element's HTML + # attributes, but it has two special keys, one of which is the + # special "contents" key contains text between OPTION tags (the + # other is the "__select" key: see the end_option method) + map = self._option + key = "contents" + elif self._textarea is not None: + map = self._textarea + key = "value" + else: + return + + if not map.has_key(key): + map[key] = data + else: + map[key] = map[key] + data + +## def handle_data(self, data): +## if self._option is not None: +## contents = string.strip(data) +## controls = self._current_form[2] +## if not self._option.has_key("value"): +## self._option["value"] = contents +## if not self._option.has_key("label"): +## self._option["label"] = contents +## # self._option is a dictionary of the OPTION element's HTML +## # attributes, but it has two special keys: 
+## # 1. special "contents" key contains text between OPTION tags +## self._option["contents"] = contents +## # 2. stuff dict of SELECT HTML attrs into a special private key +## # (gets deleted again later) +## self._option["__select"] = self._select +## self._append_select_control(self._option) +## self._option = None +## elif self._textarea is not None: +## #self._textarea["value"] = data +## if self._textarea.get("value") is None: +## self._textarea["value"] = data +## else: +## self._textarea["value"] = self._textarea["value"] + data + + def do_button(self, attrs): + if self._current_form is None: + self.error(ParseError("start of BUTTON before start of FORM")) + d = {} + d["type"] = "submit" # default + for key, val in attrs: + d[key] = val + controls = self._current_form[2] + + type = d["type"] + name = d.get("name") + # we don't want to lose information, so use a type string that + # doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON} + # eg. type for BUTTON/RESET is "resetbutton" + # (type for INPUT/RESET is "reset") + type = type+"button" + controls.append((type, name, d)) + + def do_input(self, attrs): + if self._current_form is None: + self.error(ParseError("start of INPUT before start of FORM")) + d = {} + d["type"] = "text" # default + for key, val in attrs: + d[key] = val + controls = self._current_form[2] + + type = d["type"] + name = d.get("name") + controls.append((type, name, d)) + + def do_isindex(self, attrs): + if self._current_form is None: + self.error(ParseError("start of ISINDEX before start of FORM")) + d = {} + for key, val in attrs: + d[key] = val + controls = self._current_form[2] + + # isindex doesn't have type or name HTML attributes + controls.append(("isindex", None, d)) + +# use HTMLParser if we have it (it does XHTML), htmllib otherwise +try: + import HTMLParser +except ImportError: + import htmllib, formatter + class _FormParser(_AbstractFormParser, htmllib.HTMLParser): + # This is still here for compatibility with Python 1.5.2. 
+ # It doesn't do the right thing with XHTML. + def __init__(self, ignore_errors, entitydefs=None): + htmllib.HTMLParser.__init__(self, formatter.NullFormatter()) + _AbstractFormParser.__init__(self, ignore_errors, entitydefs) + + def do_option(self, attrs): + _AbstractFormParser._start_option(self, attrs) + + _FORM_PARSER_CLASS = _FormParser +else: + class _XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser): + # thanks to Michael Howitz for this! + def __init__(self, ignore_errors, entitydefs=None): + HTMLParser.HTMLParser.__init__(self) + _AbstractFormParser.__init__(self, ignore_errors, entitydefs) + + def start_option(self, attrs): + _AbstractFormParser._start_option(self, attrs) + + def end_option(self): + _AbstractFormParser._end_option(self) + + def handle_starttag(self, tag, attrs): + try: + method = getattr(self, 'start_' + tag) + except AttributeError: + try: + method = getattr(self, 'do_' + tag) + except AttributeError: + pass # unknown tag + else: + method(attrs) + else: + method(attrs) + + def handle_endtag(self, tag): + try: + method = getattr(self, 'end_' + tag) + except AttributeError: + pass # unknown tag + else: + method() + + # handle_charref, handle_entityref and default entitydefs are taken + # from sgmllib + def handle_charref(self, name): + try: + n = int(name) + except ValueError: + self.unknown_charref(name) + return + if not 0 <= n <= 255: + self.unknown_charref(name) + return + self.handle_data(chr(n)) + + # Definition of entities -- derived classes may override + entitydefs = \ + {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''} + + def handle_entityref(self, name): + table = self.entitydefs + if name in table: + self.handle_data(table[name]) + else: + self.unknown_entityref(name) + return + + # These methods would have passed through the ref intact if I'd thought + # of it earlier, but since the old parser silently swallows unknown + # refs, so does this new parser. 
+ def unknown_entityref(self, ref): pass + def unknown_charref(self, ref): pass + + _FORM_PARSER_CLASS = _XHTMLCompatibleFormParser + + +class Control: + """An HTML form control. + + An HTMLForm contains a sequence of Controls. HTMLForm delegates lots of + things to Control objects, and most of Control's methods are, in effect, + documented by the HTMLForm docstrings. + + The Controls in an HTMLForm can be got at via the HTMLForm.find_control + method or the HTMLForm.controls attribute. + + Control instances are usually constructed using the ParseFile / + ParseResponse functions, so you can probably ignore the rest of this + paragraph. A Control is only properly initialised after the fixup method + has been called. In fact, this is only strictly necessary for ListControl + instances. This is necessary because ListControls are built up from + ListControls each containing only a single item, and their initial value(s) + can only be known after the sequence is complete. + + The types and values that are acceptable for assignment to the value + attribute are defined by subclasses. + + If the disabled attribute is true, this represents the state typically + represented by browsers by `greying out' a control. If the disabled + attribute is true, the Control will raise AttributeError if an attempt is + made to change its value. In addition, the control will not be considered + `successful' as defined by the W3C HTML 4 standard -- ie. it will + contribute no data to the return value of the HTMLForm.click* methods. To + enable a control, set the disabled attribute to a false value. + + If the readonly attribute is true, the Control will raise AttributeError if + an attempt is made to change its value. To make a control writable, set + the readonly attribute to a false value. + + All controls have the disabled and readonly attributes, not only those that + may have the HTML attributes of the same names. 
+ + On assignment to the value attribute, the following exceptions are raised: + TypeError, AttributeError (if the value attribute should not be assigned + to, because the control is disabled, for example) and ValueError. + + If the name or value attributes are None, or the value is an empty list, or + if the control is disabled, the control is not successful. + + Public attributes: + + type: string describing type of control (see the keys of the + HTMLForm.type2class dictionary for the allowable values) (readonly) + name: name of control (readonly) + value: current value of control (subclasses may allow a single value, a + sequence of values, or either) + disabled: disabled state + readonly: readonly state + id: value of id HTML attribute + + """ + def __init__(self, type, name, attrs): + """ + type: string describing type of control (see the keys of the + HTMLForm.type2class dictionary for the allowable values) + name: control name + attrs: HTML attributes of control's HTML element + + """ + raise NotImplementedError() + + def add_to_form(self, form): + form.controls.append(self) + + def fixup(self): + pass + + def __getattr__(self, name): raise NotImplementedError() + def __setattr__(self, name, value): raise NotImplementedError() + + def pairs(self): + """Return list of (key, value) pairs suitable for passing to urlencode. + """ + raise NotImplementedError() + + def _write_mime_data(self, mw): + """Write data for this control to a MimeWriter.""" + # called by HTMLForm + for name, value in self.pairs(): + mw2 = mw.nextpart() + mw2.addheader("Content-disposition", + 'form-data; name="%s"' % name, 1) + f = mw2.startbody(prefix=0) + f.write(value) + + def __str__(self): + raise NotImplementedError() + + +#--------------------------------------------------- +class ScalarControl(Control): + """Control whose value is not restricted to one of a prescribed set. + + Some ScalarControls don't accept any value attribute. 
Otherwise, takes a + single value, which must be string-like. + + Additional read-only public attribute: + + attrs: dictionary mapping the names of original HTML attributes of the + control to their values + + """ + def __init__(self, type, name, attrs): + self.__dict__["type"] = string.lower(type) + self.__dict__["name"] = name + self._value = attrs.get("value") + self.disabled = attrs.has_key("disabled") + self.readonly = attrs.has_key("readonly") + self.id = attrs.get("id") + + self.attrs = attrs.copy() + + self._clicked = False + + def __getattr__(self, name): + if name == "value": + return self.__dict__["_value"] + else: + raise AttributeError("%s instance has no attribute '%s'" % + (self.__class__.__name__, name)) + + def __setattr__(self, name, value): + if name == "value": + if not isstringlike(value): + raise TypeError("must assign a string") + elif self.readonly: + raise AttributeError("control '%s' is readonly" % self.name) + elif self.disabled: + raise AttributeError("control '%s' is disabled" % self.name) + self.__dict__["_value"] = value + elif name in ("name", "type"): + raise AttributeError("%s attribute is readonly" % name) + else: + self.__dict__[name] = value + + def pairs(self): + name = self.name + value = self.value + if name is None or value is None or self.disabled: + return [] + return [(name, value)] + + def __str__(self): + name = self.name + value = self.value + if name is None: name = "<None>" + if value is None: value = "<None>" + + infos = [] + if self.disabled: infos.append("disabled") + if self.readonly: infos.append("readonly") + info = string.join(infos, ", ") + if info: info = " (%s)" % info + + return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info) + + +#--------------------------------------------------- +class TextControl(ScalarControl): + """Textual input control. 
class FileControl(ScalarControl):
    """File upload control (INPUT TYPE=FILE).

    The value attribute of a FileControl is always None.  The data to be
    uploaded is supplied through add_file instead.

    Additional public method: add_file

    """
    def __init__(self, type, name, attrs):
        ScalarControl.__init__(self, type, name, attrs)
        # Whatever value the HTML carried is dropped.
        self._value = None
        # list of (file_object, content_type, filename) tuples, in the order
        # they were added
        self._upload_data = []

    def __setattr__(self, name, value):
        if name not in ("value", "name", "type"):
            self.__dict__[name] = value
            return
        raise AttributeError("%s attribute is readonly" % name)

    def add_file(self, file_object, content_type=None, filename=None):
        """Queue a file-like object for upload with this control.

        file_object: object with a read method
        content_type: None or string-like; None is stored as
         "application/octet-stream"
        filename: None or string-like

        Raises TypeError if any argument is of the wrong kind.

        """
        if not hasattr(file_object, "read"):
            raise TypeError("file-like object must have read method")
        if not (content_type is None or isstringlike(content_type)):
            raise TypeError("content type must be None or string-like")
        if not (filename is None or isstringlike(filename)):
            raise TypeError("filename must be None or string-like")

        if content_type is None:
            content_type = "application/octet-stream"
        entry = (file_object, content_type, filename)
        self._upload_data.append(entry)

    def pairs(self):
        # XXX should it be successful even if unnamed?
        control_name = self.name
        if control_name is None or self.disabled:
            return []
        return [(control_name, "")]

    def _write_mime_data(self, mw):
        # called by HTMLForm
        uploads = self._upload_data
        count = len(uploads)
        if count == 0:
            # nothing was added: write nothing at all
            return

        if count == 1:
            # single file: one part carrying the file's contents
            file_object, content_type, filename = uploads[0]
            part = mw.nextpart()
            fn_part = ''
            if filename:
                fn_part = '; filename="%s"' % filename
            disp = 'form-data; name="%s"%s' % (self.name, fn_part)
            part.addheader("Content-disposition", disp, prefix=1)
            out = part.startbody(content_type, prefix=0)
            out.write(file_object.read())
            return

        # multiple files: one multipart/mixed part with a sub-part per file
        outer = mw.nextpart()
        disp = 'form-data; name="%s"' % self.name
        outer.addheader("Content-disposition", disp, prefix=1)
        outer.startmultipartbody("mixed", prefix=0)
        for file_object, content_type, filename in uploads:
            inner = outer.nextpart()
            fn_part = ''
            if filename:
                fn_part = '; filename="%s"' % filename
            disp = 'file%s' % fn_part
            inner.addheader("Content-disposition", disp, prefix=1)
            out = inner.startbody(content_type, prefix=0)
            out.write(file_object.read())
        outer.lastpart()

    def __str__(self):
        shown_name = self.name
        if shown_name is None:
            shown_name = "<None>"

        if self._upload_data:
            labels = []
            for upload in self._upload_data:
                # upload is (file_object, content_type, filename)
                filename = upload[2]
                if filename is None:
                    labels.append("<Unnamed file>")
                else:
                    labels.append(filename)
            shown_value = string.join(labels, ", ")
        else:
            shown_value = "<No files added>"

        flags = []
        if self.disabled:
            flags.append("disabled")
        if self.readonly:
            flags.append("readonly")
        suffix = string.join(flags, ", ")
        if suffix:
            suffix = " (%s)" % suffix

        return "<%s(%s=%s)%s>" % (self.__class__.__name__,
                                  shown_name, shown_value, suffix)
ISINDEX and regular form submission are + mutually exclusive -- either submit a form, or the ISINDEX. + + Having said this, since ISINDEX controls may appear in forms (which is + probably bad HTML), ParseFile / ParseResponse will include them in the + HTMLForm instances it returns. You can set the ISINDEX's value, as with + any other control (but note that ISINDEX controls have no name, so you'll + need to use the type argument of set_value!). When you submit the form, + the ISINDEX will not be successful (ie., no data will get returned to the + server as a result of its presence), unless you click on the ISINDEX + control, in which case the ISINDEX gets submitted instead of the form: + + form.set_value("my isindex value", type="isindex") + urllib2.urlopen(form.click(type="isindex")) + + ISINDEX elements outside of FORMs are ignored. If you want to submit one + by hand, do it like so: + + url = urlparse.urljoin(page_uri, "?"+urllib.quote_plus("my isindex value")) + result = urllib2.urlopen(url) + + """ + def __init__(self, type, name, attrs): + ScalarControl.__init__(self, type, name, attrs) + if self._value is None: + self._value = "" + + def pairs(self): + return [] + + def _click(self, form, coord, return_type): + # Relative URL for ISINDEX submission: instead of "foo=bar+baz", + # want "bar+baz". + # This doesn't seem to be specified in HTML 4.01 spec. (ISINDEX is + # deprecated in 4.01, but it should still say how to submit it). + # Submission of ISINDEX is explained in the HTML 3.2 spec, though. 
class IgnoreControl(ScalarControl):
    """Control that we're not interested in.

    Covers:

    INPUT/RESET
    BUTTON/RESET
    INPUT/BUTTON
    BUTTON/BUTTON

    In HTML 4 terminology these controls are never successful: they never
    cause any information to be returned to the server.

    BUTTON/BUTTON is used to generate events for script embedded in HTML.

    The value attribute of IgnoreControl is always None.

    """
    def __init__(self, type, name, attrs):
        ScalarControl.__init__(self, type, name, attrs)
        # Discard any value given in the HTML.
        self._value = None

    def __setattr__(self, name, value):
        # value, name and type may never be assigned; everything else is
        # stored as a plain instance attribute.
        if name == "value":
            raise AttributeError(
                "control '%s' is ignored, hence read-only" % self.name)
        if name in ("name", "type"):
            raise AttributeError("%s attribute is readonly" % name)
        self.__dict__[name] = value
CHECKBOXes + and multiple-selection SELECTs (those having the "multiple" HTML attribute) + accept sequences of any length. + + Note the following mistake: + + control.value = some_value + assert control.value == some_value # not necessarily true + + The reason for this is that the value attribute always gives the list items + in the order they were listed in the HTML. + + ListControl items can also be referred to by their labels instead of names. + Use the by_label argument, and the set_value_by_label, get_value_by_label + methods. + + XXX RadioControl and CheckboxControl don't implement by_label yet. + + Note that, rather confusingly, though SELECT controls are represented in + HTML by SELECT elements (which contain OPTION elements, representing + individual list items), CHECKBOXes and RADIOs are not represented by *any* + element. Instead, those controls are represented by a collection of INPUT + elements. For example, this is a SELECT control, named "control1": + + <select name="control1"> + <option>foo</option> + <option value="1">bar</option> + </select> + + and this is a CHECKBOX control, named "control2": + + <input type="checkbox" name="control2" value="foo" id="cbe1"> + <input type="checkbox" name="control2" value="bar" id="cbe2"> + + The id attribute of a CHECKBOX or RADIO ListControl is always that of its + first element (for example, "cbe1" above). + + + Additional read-only public attribute: multiple. + + + ListControls are built up by the parser from their component items by + creating one ListControl per item, consolidating them into a single master + ListControl held by the HTMLForm: + + -User calls form.new_control(...) + -Form creates Control, and calls control.add_to_form(self). + -Control looks for a Control with the same name and type in the form, and + if it finds one, merges itself with that control by calling + control.merge_control(self). 
The first Control added to the form, of a + particular name and type, is the only one that survives in the form. + -Form calls control.fixup for all its controls. ListControls in the form + know they can now safely pick their default values. + + To create a ListControl without an HTMLForm, use: + + control.merge_control(new_control) + + """ + def __init__(self, type, name, attrs={}, select_default=False, + called_as_base_class=False): + """ + select_default: for RADIO and multiple-selection SELECT controls, pick + the first item as the default if no 'selected' HTML attribute is + present + + """ + if not called_as_base_class: + raise NotImplementedError() + + self.__dict__["type"] = string.lower(type) + self.__dict__["name"] = name + self._value = attrs.get("value") + self.disabled = False + self.readonly = False + self.id = attrs.get("id") + + self._attrs = attrs.copy() + # As Controls are merged in with .merge_control(), self._attrs will + # refer to each Control in turn -- always the most recently merged + # control. Each merged-in Control instance corresponds to a single + # list item: see ListControl.__doc__. + if attrs: + self._attrs_list = [self._attrs] # extended by .merge_control() + self._disabled_list = [self._attrs.has_key("disabled")] # ditto + else: + self._attrs_list = [] # extended by .merge_control() + self._disabled_list = [] # ditto + + self._select_default = select_default + self._clicked = False + # Some list controls can have their default set only after all items + # are known. If so, self._value_is_set is false, and the self.fixup + # method, called after all items have been added, sets the default. 
def _set_single_selected_state(self, action, by_label):
    """Clear, set or toggle the only item of a single-item control.

    action: 0 to clear, 1 to set, 2 to toggle (handed on unchanged to
     _set_selected_state)
    by_label: accepted for symmetry with the other item methods, but not
     used to look anything up -- the caller supplies neither an item name
     nor a label here, so there is nothing to translate

    Raises ItemCountError if the control does not contain exactly one item.

    """
    if len(self._menu) != 1:
        raise ItemCountError("'%s' is not a single-item control" %
                             self.name)

    # self._menu holds item *names*.  The name must therefore not be passed
    # through _value_from_label (which expects a label), and
    # _set_selected_state must be told that it is receiving a name, or it
    # would attempt the label-to-name translation itself.
    self._set_selected_state(self._menu[0], action, False)
"""Get disabled state of named list item in a ListControl.""" + if by_label: + name = self._value_from_label(name) + try: + i = self._menu.index(name) + except ValueError: + raise ItemNotFoundError() + else: + return self._disabled_list[i] + + def set_item_disabled(self, disabled, name, by_label=False): + """Set disabled state of named list item in a ListControl. + + disabled: boolean disabled state + + """ + if by_label: + name = self._value_from_label(name) + try: + i = self._menu.index(name) + except ValueError: + raise ItemNotFoundError() + else: + self._disabled_list[i] = bool(disabled) + + def set_all_items_disabled(self, disabled): + """Set disabled state of all list items in a ListControl. + + disabled: boolean disabled state + + """ + for i in range(len(self._disabled_list)): + self._disabled_list[i] = bool(disabled) + + def get_item_attrs(self, name, by_label=False): + """Return dictionary of HTML attributes for a single ListControl item. + + The HTML element types that describe list items are: OPTION for SELECT + controls, INPUT for the rest. These elements have HTML attributes that + you may occasionally want to know about -- for example, the "alt" HTML + attribute gives a text string describing the item (graphical browsers + usually display this as a tooltip). + + The returned dictionary maps HTML attribute names to values. The names + and values are taken from the original HTML. + + Note that for SELECT controls, the returned dictionary contains a + special key "contents" -- see SelectControl.__doc__. 
def merge_control(self, control):
    """Absorb the list items of another control into this one.

    control: control of the same class and the same multiple setting as
     self (both conditions are asserted)

    The item names, per-item HTML attributes and per-item disabled flags of
    control are appended to those of self.  For multiple-selection controls
    the selection flags are appended too; for single-selection controls the
    selection of control replaces that of self, but only if control had its
    value explicitly set.

    """
    assert bool(control.multiple) == bool(self.multiple)
    assert isinstance(control, self.__class__)

    # The three per-item lists are kept parallel.
    for list_attr in ("_menu", "_attrs_list", "_disabled_list"):
        getattr(self, list_attr).extend(getattr(control, list_attr))

    incoming_value_is_set = control._value_is_set
    if control.multiple:
        self._selected.extend(control._selected)
    elif incoming_value_is_set:
        self._selected = control._selected

    if incoming_value_is_set:
        self._value_is_set = True
def _multiple_set_value(self, value):
    """Replace the selection of a multiple-selection control.

    value: sequence of item names; exactly the named items end up selected

    Raises TypeError if value is None or string-like, ItemNotFoundError if
    a name does not match any item, and AttributeError if a named item is
    disabled.  The current selection is left untouched when an exception is
    raised, because the new selection is built separately and only stored
    at the end.

    """
    if value is None or isstringlike(value):
        raise TypeError("ListControl, must set a sequence")

    menu = self._menu
    disabled_list = self._disabled_list
    selected = [False]*len(self._selected)

    for v in value:
        try:
            # first item with this name, as in _single_set_value
            i = menu.index(v)
        except ValueError:
            raise ItemNotFoundError("no item named '%s'" % repr(v))
        if disabled_list[i]:
            # Report the offending item, not the whole sequence: formatting
            # the sequence is misleading, and fails outright with TypeError
            # when it is a tuple whose length is not 1.
            raise AttributeError("item '%s' is disabled" % v)
        selected[i] = True
    self._selected = selected
") + if info: info = " (%s)" % info + + return "<%s(%s=[%s])%s>" % (self.__class__.__name__, + name, string.join(display, ", "), info) + + +class RadioControl(ListControl): + """ + Covers: + + INPUT/RADIO + + """ + def __init__(self, type, name, attrs, select_default=False): + ListControl.__init__(self, type, name, attrs, select_default, + called_as_base_class=True) + self.__dict__["multiple"] = False + value = attrs.get("value", "on") + self._menu = [value] + checked = attrs.has_key("checked") + if checked: + self._value_is_set = True + self._selected = value + else: + self._selected = None + + def fixup(self): + if not self._value_is_set: + # no item explicitly selected + assert self._selected is None + if self._select_default: + self._selected = self._menu[0] + self._value_is_set = True + + +class CheckboxControl(ListControl): + """ + Covers: + + INPUT/CHECKBOX + + """ + def __init__(self, type, name, attrs, select_default=False): + ListControl.__init__(self, type, name, attrs, select_default, + called_as_base_class=True) + self.__dict__["multiple"] = True + value = attrs.get("value", "on") + self._menu = [value] + checked = attrs.has_key("checked") + self._selected = [checked] + self._value_is_set = True + + def fixup(self): + # If no items were explicitly checked in HTML, that's how we must + # leave it, so we have nothing to do here. + assert self._value_is_set + + +class SelectControl(ListControl): + """ + Covers: + + SELECT (and OPTION) + + SELECT control values and labels are subject to some messy defaulting + rules. For example, if the HTML repreentation of the control is: + + <SELECT name=year> + <OPTION value=0 label="2002">current year</OPTION> + <OPTION value=1>2001</OPTION> + <OPTION>2000</OPTION> + </SELECT> + + The items, in order, have labels "2002", "2001" and "2000", whereas their + values are "0", "1" and "2000" respectively. 
Note that the value of the + last OPTION in this example defaults to its contents, as specified by RFC + 1866, as do the labels of the second and third OPTIONs. + + The purpose of these methods is that the OPTION labels are sometimes much + more meaningful than the OPTION values, which can make for more + maintainable code. + + Additional read-only public attribute: attrs + + The attrs attribute is a dictionary of the original HTML attributes of the + SELECT element. Other ListControls do not have this attribute, because in + other cases the control as a whole does not correspond to any single HTML + element. The get_item_attrs method may be used as usual to get at the + HTML attributes of the HTML elements corresponding to individual list items + (for SELECT controls, these are OPTION elements). + + Another special case is that the attributes dictionaries returned by + get_item_attrs have a special key "contents" which does not correspond to + any real HTML attribute, but rather contains the contents of the OPTION + element: + + <OPTION>this bit</OPTION> + + """ + # HTML attributes here are treated slightly differently from other list controls: + # -The SELECT HTML attributes dictionary is stuffed into the OPTION + # HTML attributes dictionary under the "__select" key. + # -The content of each OPTION element is stored under the special + # "contents" key of the dictionary. + # After all this, the dictionary is passed to the SelectControl constructor + # as the attrs argument, as usual. However: + # -The first SelectControl constructed when building up a SELECT control + # has a constructor attrs argument containing only the __select key -- so + # this SelectControl represents an empty SELECT control. + # -Subsequent SelectControls have both OPTION HTML attributes in attrs and + # the __select dictionary containing the SELECT HTML attributes.
def __init__(self, type, name, attrs, select_default=False):
    # attrs mixes two things: the OPTION element's HTML attributes, and,
    # under the "__select" key, the HTML attributes of the enclosing SELECT
    # element.  Separate them, working on copies so that the caller's
    # dictionary is left alone.
    select_attrs = attrs["__select"].copy()
    option_attrs = attrs.copy()
    del option_attrs["__select"]
    self.attrs = select_attrs

    ListControl.__init__(self, type, name, option_attrs, select_default,
                         called_as_base_class=True)

    # The disabled state and id come from the SELECT element, not from the
    # OPTION.
    self._label_map = None
    self.disabled = select_attrs.has_key("disabled")
    self.id = select_attrs.get("id")

    self._menu = []
    self._value_is_set = False
    is_multiple = select_attrs.has_key("multiple")
    if is_multiple:
        # one selection flag per item
        self.__dict__["multiple"] = True
        self._selected = []
    else:
        # name of the selected item, or None
        self.__dict__["multiple"] = False
        self._selected = None

    if not option_attrs:
        # no OPTION item data was provided: this is an empty SELECT
        return

    item_name = option_attrs["value"]
    self._menu.append(item_name)
    selected = option_attrs.has_key("selected")
    if selected:
        self._value_is_set = True
    if is_multiple:
        self._selected.append(selected)
    elif selected:
        self._selected = item_name
class SubmitControl(ScalarControl):
    """Submit button.

    Covers:

    INPUT/SUBMIT
    BUTTON/SUBMIT

    A SubmitControl only contributes its (name, value) pair to the submitted
    data while it is being clicked (see _click and pairs).  It is marked
    readonly on construction.

    """
    def __init__(self, type, name, attrs):
        ScalarControl.__init__(self, type, name, attrs)
        # IE5 defaults SUBMIT value to "Submit Query"; Firebird 0.6 leaves it
        # blank, Konqueror 3.1 defaults to "Submit".  HTML spec. doesn't seem
        # to define this.
        if self.value is None:
            # Store the default directly rather than assigning to
            # self.value: that assignment is validated by
            # ScalarControl.__setattr__, which raises AttributeError when
            # the HTML element carries the disabled or readonly attribute,
            # so constructing such a control would fail.
            self.__dict__["_value"] = ""
        self.readonly = True

    def _click(self, form, coord, return_type):
        """Submit form as if this control had been clicked.

        form: object whose _switch_click method builds the result
        coord: stored in self._clicked for the duration of the call
        return_type: passed through to form._switch_click

        Returns whatever form._switch_click returns.

        """
        self._clicked = coord
        try:
            return form._switch_click(return_type)
        finally:
            # Always reset, even if _switch_click raises; otherwise pairs()
            # would keep reporting this control as clicked.
            self._clicked = False

    def pairs(self):
        """Return ScalarControl.pairs(self) while clicked, else []."""
        if not self._clicked:
            return []
        return ScalarControl.pairs(self)
+ + import ClientForm + forms = ClientForm.ParseFile(html, base_uri) + form = forms[0] + + form["query"] = "Python" + form.set("lots", "nr_results") + + response = urllib2.urlopen(form.click()) + + Usually, HTMLForm instances are not created directly. Instead, the + ParseFile or ParseResponse factory functions are used. If you do construct + HTMLForm objects yourself, however, note that an HTMLForm instance is only + properly initialised after the fixup method has been called (ParseFile and + ParseResponse do this for you). See ListControl.__doc__ for the reason + this is required. + + Indexing a form (form["control_name"]) returns the named Control's value + attribute. Assignment to a form index (form["control_name"] = something) + is equivalent to assignment to the named Control's value attribute. If you + need to be more specific than just supplying the control's name, use the + set_value and get_value methods. + + ListControl values are lists of item names. The list item's name is the + value of the corresponding HTML element's "value" attribute. + + Example: + + <INPUT type="CHECKBOX" name="cheeses" value="leicester"></INPUT> + <INPUT type="CHECKBOX" name="cheeses" value="cheddar"></INPUT> + + defines a CHECKBOX control with name "cheeses" which has two items, named + "leicester" and "cheddar". + + Another example: + + <SELECT name="more_cheeses"> + <OPTION>1</OPTION> + <OPTION value="2" label="CHEDDAR">cheddar</OPTION> + </SELECT> + + defines a SELECT control with name "more_cheeses" which has two items, + named "1" and "2". + + To set, clear or toggle individual list items, use the set and toggle + methods. To set the whole value, do as for any other control:use indexing + or the set_/get_value methods. 
+ + Example: + + # select *only* the item named "cheddar" + form["cheeses"] = ["cheddar"] + # select "cheddar", leave other items unaffected + form.set("cheddar", "cheeses") + + Some controls (RADIO and SELECT without the multiple attribute) can only + have zero or one items selected at a time. Some controls (CHECKBOX and + SELECT with the multiple attribute) can have multiple items selected at a + time. To set the whole value of a multiple-selection ListControl, assign a + sequence to a form index: + + form["cheeses"] = ["cheddar", "leicester"] + + To check whether a control has an item, or whether an item is selected, + respectively: + + "cheddar" in form.possible_items("cheeses") + "cheddar" in form["cheeses"] # (or "cheddar" in form.get_value("cheeses")) + + Note that some items may be disabled (see below). + + Note the following mistake: + + form[control_name] = control_value + assert form[control_name] == control_value # not necessarily true + + The reason for this is that form[control_name] always gives the list items + in the order they were listed in the HTML. + + List items (hence list values, too) can be referred to in terms of list + item labels rather than list item names. Currently, this is only possible + for SELECT controls (this is a bug). To use this feature, use the by_label + arguments to the various HTMLForm methods. Note that it is *item* names + (hence ListControl values also), not *control* names, that can be referred + to by label. + + The question of default values of OPTION contents, labels and values is + somewhat complicated: see SelectControl.__doc__ and + ListControl.get_item_attrs.__doc__ if you think you need to know. + + Controls can be disabled or readonly. In either case, the control's value + cannot be changed until you clear those flags (using the methods on + HTMLForm). Disabled is the state typically represented by browsers by + `greying out' a control. 
Disabled controls are not `successful' -- they + don't cause data to get returned to the server. Readonly controls usually + appear in browsers as read-only text boxes. Readonly controls are + successful. List items can also be disabled. Attempts to select disabled + items (with form[name] = value, or using the ListControl.set method, for + example) fail. Attempts to clear disabled items are allowed. + + If a lot of controls are readonly, it can be useful to do this: + + form.set_all_readonly(False) + + When you want to do several things with a single control, or want to do + less common things, like changing which controls and items are disabled, + you can get at a particular control: + + control = form.find_control("cheeses") + control.set_item_disabled(False, "gruyere") + control.set(True, "gruyere") + + Most methods on HTMLForm just delegate to the contained controls, so see + the docstrings of the various Control classes for further documentation. + Most of these delegating methods take name, type, kind, id and nr arguments + to specify the control to be operated on: see + HTMLForm.find_control.__doc__. + + ControlNotFoundError (subclass of ValueError) is raised if the specified + control can't be found. This includes occasions where a non-ListControl + is found, but the method (set, for example) requires a ListControl. + ItemNotFoundError (subclass of ValueError) is raised if a list item can't + be found. ItemCountError (subclass of ValueError) is raised if an attempt + is made to select more than one item and the control doesn't allow that, or + set_single or toggle_single is called and the control contains more than + one item. AttributeError is raised if a control or item is readonly or + disabled and an attempt is made to alter its value (assignment with + form[name] = value reports this as ValueError instead).
+ + XXX CheckboxControl and RadioControl don't yet support item access by label + + Security note: Remember that any passwords you store in HTMLForm instances + will be saved to disk in the clear if you pickle them (directly or + indirectly). The simplest solution to this is to avoid pickling HTMLForm + objects. You could also pickle before filling in any password, or just set + the password to "" before pickling. + + + Public attributes: + + action: full (absolute URI) form action + method: "GET" or "POST" + enctype: form transfer encoding MIME type + name: name of form (None if no name was specified) + attrs: dictionary mapping original HTML form attributes to their values + + controls: list of Control instances; do not alter this list + (instead, call form.new_control to make a Control and add it to the + form, or control.add_to_form if you already have a Control instance) + + + + Methods for form filling: + ------------------------- + + Most of these methods have very similar arguments. See + HTMLForm.find_control.__doc__ for details of the name, type, kind, id and + nr arguments. See above for a description of by_label.
+ + find_control(name=None, type=None, kind=None, id=None, predicate=None, + nr=None) + + get_value(name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + set_value(value, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + + set_all_readonly(readonly) + + + Methods applying only to ListControls: + + possible_items(name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + + set(selected, item_name, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + toggle(item_name, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + + set_single(selected, + name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + toggle_single(name=None, type=None, kind=None, id=None, nr=None, + by_label=False) + + + Method applying only to FileControls: + + add_file(file_object, + content_type=None, filename=None, + name=None, id=None, nr=None) + + + Methods applying only to clickable controls: + + click(name=None, type=None, id=None, nr=0, coord=(1,1)) + click_request_data(name=None, type=None, id=None, nr=0, coord=(1,1)) + click_pairs(name=None, type=None, id=None, nr=0, coord=(1,1)) + + """ + + type2class = { + "text": TextControl, + "password": PasswordControl, + "hidden": HiddenControl, + "textarea": TextareaControl, + + "isindex": IsindexControl, + + "file": FileControl, + + "button": IgnoreControl, + "buttonbutton": IgnoreControl, + "reset": IgnoreControl, + "resetbutton": IgnoreControl, + + "submit": SubmitControl, + "submitbutton": SubmitButtonControl, + "image": ImageControl, + + "radio": RadioControl, + "checkbox": CheckboxControl, + "select": SelectControl, + } + +#--------------------------------------------------- +# Initialisation. Use ParseResponse / ParseFile instead.
def __init__(self, action, method="GET",
             enctype="application/x-www-form-urlencoded",
             name=None, attrs=None):
    """Create a form with no controls.

    In the usual case, use ParseResponse (or ParseFile) to create new
    HTMLForm objects instead of calling this directly.

    action: full (absolute URI) form action
    method: "GET" or "POST"
    enctype: form transfer encoding MIME type
    name: name of form
    attrs: dictionary mapping original HTML form attributes to their
     values; the form stores a copy, or an empty dictionary if None

    """
    self.action = action
    self.method = method
    self.enctype = enctype
    self.name = name
    # Store a copy so the form never shares the caller's dictionary.
    if attrs is None:
        self.attrs = {}
    else:
        self.attrs = attrs.copy()
    self.controls = []

def new_control(self, type, name, attrs,
                ignore_unknown=False, select_default=False):
    """Create a control of the given type and add it to the form.

    This is usually called by ParseFile and ParseResponse.  Don't call it
    yourself unless you're building your own Control instances.

    Note that controls representing lists of items are built up from
    controls holding only a single list item.  See ListControl.__doc__ for
    further information.

    type: type of control (see Control.__doc__ for a list); it is
     lower-cased before being looked up in type2class
    name: passed to the control's constructor together with type and a
     copy of attrs
    attrs: HTML attributes of control (a dictionary; it is copied)
    ignore_unknown: if true, use a dummy Control instance (IgnoreControl)
     for controls of unknown type; otherwise, raise ValueError
    select_default: for RADIO and multiple-selection SELECT controls, pick
     the first item as the default if no 'selected' HTML attribute is
     present (this defaulting happens when the HTMLForm.fixup method is
     called); only passed on to ListControl subclasses

    """
    type = string.lower(type)
    klass = self.type2class.get(type)
    if klass is None:
        if not ignore_unknown:
            raise ValueError("Unknown control type '%s'" % type)
        klass = IgnoreControl

    control_attrs = attrs.copy()
    # Only list controls accept the extra select_default argument.
    if issubclass(klass, ListControl):
        control = klass(type, name, control_attrs, select_default)
    else:
        control = klass(type, name, control_attrs)
    control.add_to_form(self)

def fixup(self):
    """Normalise form after all controls have been added.

    This is usually called by ParseFile and ParseResponse.  Don't call it
    yourself unless you're building your own Control instances.

    This method should only be called once, after all controls have been
    added to the form.  It calls fixup() on every control in self.controls.

    """
    for control in self.controls:
        control.fixup()

#---------------------------------------------------
def __str__(self):
    """Return a summary: a "method action enctype" header line followed by
    one line per control, the whole wrapped in angle brackets."""
    lines = ["%s %s %s" % (self.method, self.action, self.enctype)]
    for control in self.controls:
        # NOTE(review): this text was recovered from a whitespace-collapsed
        # listing; confirm the width of the leading indent in the literal
        # below against the upstream file.
        lines.append(" %s" % str(control))
    return "<%s>" % string.join(lines, "\n")

#---------------------------------------------------
# Form-filling methods.

def __getitem__(self, name):
    """Return the value attribute of the control called name."""
    return self.find_control(name).value

def __setitem__(self, name, value):
    """Assign value to the value attribute of the control called name.

    An AttributeError raised by the assignment is re-raised as a
    ValueError carrying the same message.

    """
    control = self.find_control(name)
    try:
        control.value = value
    except AttributeError:
        # sys.exc_info() is used instead of the "except AttributeError, e"
        # form: it is accepted by every Python version from 1.5 onwards,
        # whereas the comma form is a syntax error on Python 3.
        import sys
        raise ValueError(str(sys.exc_info()[1]))

def get_value(self,
              name=None, type=None, kind=None, id=None, nr=None,
              by_label=False):
    """Return value of control.

    If only the name argument is supplied, equivalent to

    form[name]

    The control is located with find_control(name, type, kind, id, nr=nr).
    If by_label is true, the control's get_value_by_label method is used;
    NotImplementedError is raised if the control has no such method.

    """
    control = self.find_control(name, type, kind, id, nr=nr)
    if not by_label:
        return control.value
    # Only the attribute lookup is guarded, so an AttributeError raised
    # inside the by-label method itself is not masked.
    try:
        getter = control.get_value_by_label
    except AttributeError:
        raise NotImplementedError(
            "control '%s' does not yet support by_label" % control.name)
    return getter()

def set_value(self, value,
              name=None, type=None, kind=None, id=None, nr=None,
              by_label=False):
    """Set value of control.

    If only name and value arguments are supplied, equivalent to

    form[name] = value

    except that an AttributeError from the control is not converted to
    ValueError here.  The control is located with
    find_control(name, type, kind, id, nr=nr).  If by_label is true, the
    control's set_value_by_label method is used; NotImplementedError is
    raised if the control has no such method.

    """
    control = self.find_control(name, type, kind, id, nr=nr)
    if not by_label:
        control.value = value
        return
    try:
        setter = control.set_value_by_label
    except AttributeError:
        raise NotImplementedError(
            "control '%s' does not yet support by_label" % control.name)
    setter(value)

def set_all_readonly(self, readonly):
    """Set the readonly flag of every control to bool(readonly)."""
    flag = bool(readonly)
    for control in self.controls:
        control.readonly = flag


#---------------------------------------------------
# Form-filling methods applying only to ListControls.
def possible_items(self,
                   name=None, type=None, kind=None, id=None, nr=None,
                   by_label=False):
    """Return a list of all values that the specified control can take.

    The list control is located from name, type, kind, id and nr; the
    result is whatever its possible_items(by_label) method returns.

    """
    list_control = self._find_list_control(name, type, kind, id, nr)
    return list_control.possible_items(by_label)

def set(self, selected, item_name,
        name=None, type=None, kind=None, id=None, nr=None,
        by_label=False):
    """Select / deselect named list item.

    selected: boolean selected state
    item_name: the list item to change (interpreted by the control
     according to by_label)

    The remaining arguments identify the list control.

    """
    list_control = self._find_list_control(name, type, kind, id, nr)
    list_control.set(selected, item_name, by_label)

def toggle(self, item_name,
           name=None, type=None, kind=None, id=None, nr=None,
           by_label=False):
    """Toggle selected state of named list item.

    The arguments after item_name identify the list control.

    """
    list_control = self._find_list_control(name, type, kind, id, nr)
    list_control.toggle(item_name, by_label)

def set_single(self, selected,
               name=None, type=None, kind=None, id=None, nr=None,
               by_label=False):
    """Select / deselect list item in a control having only one item.

    If the control has multiple list items, ItemCountError is raised.

    This is just a convenience method, so you don't need to know the item's
    name -- the item name in these single-item controls is usually
    something meaningless like "1" or "on".

    For example, if a checkbox has a single item named "on", the following
    two calls are equivalent:

    control.set(True, "on")
    control.set_single(True)

    """
    list_control = self._find_list_control(name, type, kind, id, nr)
    list_control.set_single(selected, by_label)

def toggle_single(self, name=None, type=None, kind=None, id=None, nr=None,
                  by_label=False):
    """Toggle selected state of list item in control having only one item.

    The rest is as for HTMLForm.set_single.__doc__.

    """
    list_control = self._find_list_control(name, type, kind, id, nr)
    list_control.toggle_single(by_label)

#---------------------------------------------------
# Form-filling method applying only to FileControls.
def add_file(self, file_object, content_type=None, filename=None,
             name=None, id=None, nr=None):
    """Add a file to be uploaded.

    file_object: file-like object (with read method) from which to read
     data to upload
    content_type: MIME content type of data to upload
    filename: filename to pass to server
    name, id, nr: identify the control, which must have type "file"

    If filename is None, no filename is sent to the server.

    If content_type is None, the content type is guessed based on the
    filename and the data read from the file object.

    XXX
    At the moment, guessed content type is always application/octet-stream.
    Use sndhdr, imghdr modules.  Should also try to guess HTML, XML, and
    plain text.

    """
    file_control = self.find_control(name, "file", id=id, nr=nr)
    file_control.add_file(file_object, content_type, filename)

#---------------------------------------------------
# Form submission methods, applying only to clickable controls.

def click(self, name=None, type=None, id=None, nr=0, coord=(1,1)):
    """Return request that would result from clicking on a control.

    The request object is a urllib2.Request instance, which you can pass to
    urllib2.urlopen (or ClientCookie.urlopen).

    Only some control types (INPUT/SUBMIT & BUTTON/SUBMIT buttons and
    IMAGEs) can be clicked.

    Will click on the first clickable control, subject to the name, type,
    id and nr arguments (as for find_control).  If no name, type or id is
    specified, nr is 0 and there are no clickable controls, a request will
    be returned for the form in its current, un-clicked, state.

    ControlNotFoundError is raised if any of name, type, id or a non-zero
    nr is specified but no matching control is found.  ValueError is
    raised if the HTMLForm has an enctype attribute that is not
    recognised.

    You can optionally specify a coordinate to click at, which only makes a
    difference if you clicked on an image.

    """
    wanted = "request"
    return self._click(name, type, id, nr, coord, wanted)

def click_request_data(self,
                       name=None, type=None, id=None, nr=0, coord=(1,1)):
    """As for click method, but return a tuple (url, data, headers).

    You can use this data to send a request to the server.  This is useful
    if you're using httplib or urllib rather than urllib2.  Otherwise, use
    the click method.

    headers is a list of (name, value) pairs; data is None for a GET
    form.

    # Untested.  Have to subclass to add headers, I think -- so use urllib2
    # instead!
    import urllib
    url, data, hdrs = form.click_request_data()
    r = urllib.urlopen(url, data)

    With httplib you have to split the URL into host and path yourself and
    pass the header pairs on explicitly; there is rarely a reason to do so,
    since urllib2 gives just as much control.

    """
    wanted = "request_data"
    return self._click(name, type, id, nr, coord, wanted)

def click_pairs(self, name=None, type=None, id=None, nr=0, coord=(1,1)):
    """As for click_request_data, but returns a list of (key, value) pairs.

    You can use this list as an argument to ClientForm.urlencode.  This is
    usually only useful if you're using httplib or urllib rather than
    urllib2 or ClientCookie.  It may also be useful if you want to manually
    tweak the keys and/or values, but this should not be necessary.
    Otherwise, use the click method.

    Note that this method is only useful for forms of MIME type
    x-www-form-urlencoded.  In particular, it does not return the
    information required for file upload.  If you need file upload and are
    not using urllib2, use click_request_data.

    Also note that Python 2.0's urllib.urlencode is slightly broken: it
    only accepts a mapping, not a sequence of pairs, as an argument.  This
    messes up any ordering in the argument.  Use ClientForm.urlencode
    instead.

    """
    wanted = "pairs"
    return self._click(name, type, id, nr, coord, wanted)

#---------------------------------------------------

def find_control(self,
                 name=None, type=None, kind=None, id=None, predicate=None,
                 nr=None):
    """Locate some specific control within the form.

    At least one of the name, type, kind, id, predicate and nr arguments
    must be supplied; otherwise ValueError is raised.  If no matching
    control is found, ControlNotFoundError is raised.

    If name is specified, then the control must have the indicated name.

    If type is specified then the control must have the specified type (in
    addition to the types possible for <input> HTML tags: "text",
    "password", "hidden", "submit", "image", "button", "radio", "checkbox",
    "file" we also have "reset", "buttonbutton", "submitbutton",
    "resetbutton", "textarea", "select" and "isindex").

    If kind is specified, then the control must fall into the specified
    group, each of which satisfies a particular interface.  The kinds are
    "text", "list", "multilist", "singlelist", "clickable" and "file".

    If id is specified, then the control must have the indicated id.

    If predicate is specified, then the control must match that function.
    The predicate function is passed the control as its single argument,
    and should return a boolean value indicating whether the control
    matched.

    nr, if supplied, is the sequence number of the control (where 0 is the
    first).  Note that control 0 is the first control matching all the
    other arguments (if supplied); it is not necessarily the first control
    in the form.  If nr is omitted, 0 is used.

    """
    # Insist on at least one selector: stop at the first one supplied,
    # complain if the loop runs to completion without finding any.
    for selector in (name, type, kind, id, predicate, nr):
        if selector is not None:
            break
    else:
        raise ValueError(
            "at least one argument must be supplied to specify control")

    if nr is None:
        nr = 0
    return self._find_control(name, type, kind, id, predicate, nr)

#---------------------------------------------------
# Private methods.
def _find_list_control(self,
                       name=None, type=None, kind=None, id=None, nr=None):
    """Locate a list control; as find_control, but with the predicate
    fixed to is_listcontrol.

    ValueError is raised if none of name, type, kind, id and nr is given.

    """
    for selector in (name, type, kind, id, nr):
        if selector is not None:
            break
    else:
        raise ValueError(
            "at least one argument must be supplied to specify control")

    if nr is None:
        nr = 0
    return self._find_control(name, type, kind, id, is_listcontrol, nr)

def _find_control(self, name, type, kind, id, predicate, nr):
    """Return the nr'th control (counting from 0) matching every criterion
    that is not None.

    TypeError is raised if name, type, kind or id is given but is not
    string-like, or if predicate is given but is not callable.  ValueError
    is raised if nr is negative.  ControlNotFoundError is raised, with a
    message listing the criteria used, if there is no such control.

    """
    for label, selector in (("name", name), ("type", type),
                            ("kind", kind), ("id", id)):
        if selector is not None and not isstringlike(selector):
            raise TypeError("control %s must be string-like" % label)
    if predicate is not None and not callable(predicate):
        raise TypeError("control predicate must be callable")
    if nr < 0:
        raise ValueError("control number must be a positive integer")

    # Number of matching controls still to be passed over.
    to_skip = nr
    for control in self.controls:
        if name is not None and name != control.name:
            pass
        elif type is not None and type != control.type:
            pass
        elif kind is not None and not self._is_control_in_kind(control, kind):
            pass
        elif id is not None and id != control.id:
            pass
        elif predicate and not predicate(control):
            pass
        elif to_skip:
            # A match, but not yet the one that was asked for.
            to_skip = to_skip - 1
        else:
            return control

    # Nothing matched: describe what was being looked for.
    criteria = []
    for template, selector in (("name '%s'", name), ("type '%s'", type),
                               ("kind '%s'", kind), ("id '%s'", id),
                               ("matching predicate %s", predicate)):
        if selector is not None:
            criteria.append(template % selector)
    if nr:
        criteria.append("nr %d" % nr)
    raise ControlNotFoundError(
        "no control with "+string.join(criteria, ", "))

def _is_control_in_kind(self, control, kind):
    """Return true if control belongs to the named kind ("list",
    "multilist", "singlelist", "file", "text" or "clickable").

    ValueError is raised for any other kind.

    """
    # XXX not OO
    if kind == "list":
        return isinstance(control, ListControl)
    if kind == "multilist":
        return bool(isinstance(control, ListControl) and control.multiple)
    if kind == "singlelist":
        return bool(isinstance(control, ListControl) and
                    not control.multiple)
    if kind == "file":
        return isinstance(control, FileControl)
    if kind == "text":
        return isinstance(control, TextControl)
    if kind == "clickable":
        return (isinstance(control, SubmitControl) or
                isinstance(control, IsindexControl))
    raise ValueError("no such control kind '%s'" % kind)

def _click(self, name, type, id, nr, coord, return_type):
    """Click the selected clickable control and return the result in the
    form named by return_type ("pairs", "request_data" or "request").

    If no clickable control matches and none was explicitly requested
    (name, type and id are None and nr is 0), the form's current state is
    returned without clicking anything; otherwise ControlNotFoundError
    propagates.

    """
    try:
        control = self._find_control(name, type, "clickable", id, None, nr)
    except ControlNotFoundError:
        if name is None and type is None and id is None and nr == 0:
            # no clickable controls, but no control was explicitly
            # requested, so return state without clicking any control
            return self._switch_click(return_type)
        raise
    return control._click(self, coord, return_type)

def _pairs(self):
    """Return sequence of (key, value) pairs suitable for urlencoding.

    The pairs are gathered from each control's pairs() method, in control
    order.

    """
    collected = []
    for control in self.controls:
        for pair in control.pairs():
            collected.append(pair)
    return collected

def _request_data(self):
    """Return a tuple (url, data, headers).

    For GET, the urlencoded pairs are appended to the action after a "?",
    data is None and headers is empty.  For POST, data is either the
    urlencoded pairs or a multipart/form-data body, depending on enctype.
    ValueError is raised for any other method, or for an enctype that the
    method does not support.

    """
    urlencoded = "application/x-www-form-urlencoded"
    method = string.upper(self.method)

    if method == "GET":
        if self.enctype != urlencoded:
            raise ValueError(
                "unknown GET form encoding type '%s'" % self.enctype)
        query = urlencode(self._pairs())
        return "%s?%s" % (self.action, query), None, []

    if method != "POST":
        raise ValueError("Unknown method '%s'" % method)

    if self.enctype == urlencoded:
        return (self.action, urlencode(self._pairs()),
                [("Content-type", self.enctype)])

    if self.enctype != "multipart/form-data":
        raise ValueError(
            "unknown POST form encoding type '%s'" % self.enctype)

    body = StringIO()
    headers = []
    # presumably MimeWriter adds the multipart Content-type header to the
    # headers list (add_to_http_hdrs) -- verify against its definition
    writer = MimeWriter(body, headers)
    writer.startmultipartbody("form-data", add_to_http_hdrs=True, prefix=0)
    for control in self.controls:
        control._write_mime_data(writer)
    writer.lastpart()
    return self.action, body.getvalue(), headers

def _switch_click(self, return_type):
    """Return the form's current state as pairs, as a (url, data, headers)
    tuple, or (for any other return_type) as a urllib2.Request."""
    # This is called by HTMLForm and clickable Controls to hide switching
    # on return_type.
    # XXX
    #  not OO
    #  duplicated in IsindexControl._click
    if return_type == "pairs":
        return self._pairs()
    if return_type == "request_data":
        return self._request_data()

    url, data, headers = self._request_data()
    request = urllib2.Request(url, data)
    for key, val in headers:
        request.add_header(key, val)
    return request