#coding=utf-8
from lxml import etree
# Sample markup queried by the rest of this script: one link followed by two
# <form> elements (actions "/find.do" and "/search.do"), each holding text,
# hidden and (in the first form) submit <input> elements at several nesting
# depths inside <span>/<div> wrappers.
html = '''
<html>
<body>
<a href="http://www.baidu.com/" target="_blank">91wan游戏</a>
<form action="/find.do">
<span class="s_ipt_wr">
<input type="text" name="wd" id="kw" maxlength="100" class="s_ipt" autocomplete="off">
</span>
<input type="hidden" name="rsv_bp" value="0">
<div>
<span>
<input type="hidden" name="ie" value="utf-8">
<div>
<input type="text" name="wd" id="kw" maxlength="100" class="s_ipt" autocomplete="off">
</div>
</span>
</div>
<div class="s_btn_wr">
<input type="submit" value="百度一下" id="su" class="s_btn" onmousedown="this.className='s_btn s_btn_h'" onmouseout="this.className='s_btn'">
</div>
<span class="s_btn_wr">
<input type="submit" value="百度一下" id="su" class="s_btn" onmousedown="this.className='s_btn s_btn_h'" onmouseout="this.className='s_btn'">
</span>
<div id="sd_1363580690288" style="display: none;"></div>
</form>
<form name="f" action="/search.do">
<span class="s_ipt_wr">
<input type="text" name="wd" id="kw" maxlength="100" class="s_ipt" autocomplete="off">
</span>
<input type="hidden" name="rsv_bp" value="0">
<div>
<span>
<input type="hidden" name="ie" value="utf-8">
<div>
<input type="text" name="wd" id="kw" maxlength="100" class="s_ipt" autocomplete="off">
</div>
</span>
</div>
</form>
</body>
</html>
'''
# Parse the lower-cased sample markup into an lxml tree.
# On Python 2 the literal is a UTF-8 byte string and must be decoded first;
# on Python 3 it is already text and has no .decode(), so only decode bytes.
# NOTE(review): .lower() also lower-cases attribute values (e.g. the
# onmousedown handlers) -- kept to preserve the original behaviour.
lowered = html.lower()
page = etree.HTML(lowered.decode('utf-8') if isinstance(lowered, bytes) else lowered)
def getEle(page, xpath):
    """Print the attributes of every node matched by an XPath query.

    Args:
        page: Object exposing an ``xpath(expr)`` method that returns a
            sequence of nodes, each with an ``attrib`` mapping.
        xpath: XPath expression evaluated against ``page``.

    Returns:
        None. The match count is printed first, then for each match its
        ``attrib`` mapping followed by a ``text>>>>>>>>>`` separator line.
    """
    matches = page.xpath(xpath)
    # A parenthesised single-argument print behaves identically on Python 2
    # and 3; repr() replaces the backtick syntax that Python 3 removed.
    print("size: " + repr(len(matches)))
    # Loop variable renamed so it no longer shadows the builtin ``input``.
    for node in matches:
        print(node.attrib)
        # NOTE(review): the source lost its indentation; this separator is
        # assumed to belong inside the loop -- confirm against the original.
        print("text>>>>>>>>>")
# Dump every text/hidden <input> found anywhere inside any <form>.
getEle(page, u"//form//input[@type='text' or @type='hidden']")
上面的Python代码, 只能取到form下所有的input标签(text或hidden类型的), 但怎么把input外面的form也取出来呢? 要对应关系?
现在我用下面的方式拿到了, 不过感觉有些绕:
# Group every text/hidden <input> under the action of its enclosing <form>.
# Each <form> element is queried directly with a relative XPath (".//"), so
# the form no longer has to be serialised and parsed a second time, and only
# the already-imported ``etree`` module is needed.
markup = html.lower()
if isinstance(markup, bytes):
    # Python 2 byte string: decode so lxml receives unicode text.
    markup = markup.decode('utf-8')
page = etree.HTML(markup)
forms = page.xpath(u"//form")
print("size: " + repr(len(forms)))

# Keys keep the surrounding double quotes the original output used.
inputs_by_action = {}
for form in forms:
    # A <form> without an action attribute is filed under the empty string
    # instead of raising TypeError during concatenation.
    action = form.get('action', '')
    fields = form.xpath(u".//input[@type='text' or @type='hidden']")
    # setdefault/extend keeps the inputs of forms that share one action
    # rather than letting the later form overwrite the earlier one.
    inputs_by_action.setdefault('"' + action + '"', []).extend(fields)

for action, fields in inputs_by_action.items():
    print("action value: " + action)
    for field in fields:
        # .get() tolerates an <input> that carries no name attribute.
        print("input " + str(field.get("name")))
大家看, 有没有更简单的方式?
如果你要传回下面的form的input的话，你应该submit下面的form，不是上面的。
如果你要submit上面的form的时候，一并把下面的form的资料也一起传回，那你得用javascript帮忙，不是单单靠python就可以做得到。在submit上面的form的时候，你要先用javascript把下面的form的input value抄写一份到上面的form，然后submit时，所有资料都会齐全
追问
多谢啦。
不过， 这里边没有JavaScript和浏览器的什么事， 完全是用Python语言解析了。
就是这个问题，就是因为你只用python才有问题
python是server side运作的，而javascript是client side运作的。每一次submit form的时候，就是要从client side传回资料给server side。现在你需要的是从client side收集好资料后才submit到server side。你单单用python，在server side那边是帮不了什么忙