Mirror of https://github.com/python/cpython.git (synced 2024-12-01 11:15:56 +01:00)

Commit 47b49bf6dc
svn+ssh://pythondev@svn.python.org/python/trunk

........
r57771 | thomas.wouters | 2007-08-30 23:54:39 +0200 (Thu, 30 Aug 2007) | 5 lines

Don't lie in __all__ attributes when SSL is not available: only add the SSL
classes when they are actually created.
........
r57620 | walter.doerwald | 2007-08-28 18:38:26 +0200 (Tue, 28 Aug 2007) | 5 lines

Fix title endtag in HTMLCalender.formatyearpage().
Fix documentation for HTMLCalender.formatyearpage() (there's no themonth
parameter).
This fixes issue1046.
........
r57622 | georg.brandl | 2007-08-28 20:54:44 +0200 (Tue, 28 Aug 2007) | 2 lines

Add a crasher for the thread-unsafety of file objects.
........
r57626 | skip.montanaro | 2007-08-29 01:22:52 +0200 (Wed, 29 Aug 2007) | 1 line

fixes 813986
........
r57628 | walter.doerwald | 2007-08-29 01:35:33 +0200 (Wed, 29 Aug 2007) | 2 lines

Fix test output.
........
r57631 | skip.montanaro | 2007-08-29 03:24:11 +0200 (Wed, 29 Aug 2007) | 2 lines

Install pygettext (once the scriptsinstall target is working again).
........
r57633 | skip.montanaro | 2007-08-29 03:33:45 +0200 (Wed, 29 Aug 2007) | 2 lines

Recent items.
........
r57650 | neal.norwitz | 2007-08-29 08:15:33 +0200 (Wed, 29 Aug 2007) | 1 line

Add Bill as a developer
........
r57651 | facundo.batista | 2007-08-29 12:28:28 +0200 (Wed, 29 Aug 2007) | 5 lines

Ignore test failures caused by 'resource temporarily unavailable' exceptions
raised during FailingServerTestCase tests. [GSoC - Alan McIntyre]
........
r57680 | bill.janssen | 2007-08-30 00:35:05 +0200 (Thu, 30 Aug 2007) | 17 lines

This contains a number of things:

1) Improve the documentation of the SSL module, with a fuller explanation of
   certificate usage, another reference, proper formatting of this and that.

2) Fix Windows bug in ssl.py, and general bug in sslsocket.close(). Remove
   some unused code from ssl.py. Allow accept() to be called on sslsocket
   sockets.

3) Use try-except-else in import of ssl in socket.py. Deprecate use of
   socket.ssl().

4) Remove use of socket.ssl() in every library module, except for
   test_socket_ssl.py and test_ssl.py.
........
r57714 | georg.brandl | 2007-08-30 12:09:42 +0200 (Thu, 30 Aug 2007) | 2 lines

Stronger urge to convert filenames to str before using them as argument to
ZipFile.write().
........
r57716 | georg.brandl | 2007-08-30 12:38:56 +0200 (Thu, 30 Aug 2007) | 2 lines

Patch #1680959: add test suite for pipes module.
........
r57717 | georg.brandl | 2007-08-30 14:32:23 +0200 (Thu, 30 Aug 2007) | 3 lines

* Skip test_pipes on non-POSIX.
* Don't raise TestSkipped within a test function.
........
r57723 | mark.summerfield | 2007-08-30 17:03:03 +0200 (Thu, 30 Aug 2007) | 3 lines

Added more cross-references.
........
r57726 | walter.doerwald | 2007-08-30 17:30:09 +0200 (Thu, 30 Aug 2007) | 2 lines

Rewrap line.
........
r57727 | walter.doerwald | 2007-08-30 17:34:55 +0200 (Thu, 30 Aug 2007) | 2 lines

Set startinpos before calling the error handler.
........
r57730 | bill.janssen | 2007-08-30 19:07:28 +0200 (Thu, 30 Aug 2007) | 3 lines

Added docstrings to methods and functions.
........
r57743 | bill.janssen | 2007-08-30 20:08:06 +0200 (Thu, 30 Aug 2007) | 1 line

added note on new ssl module and deprecation of socket.ssl
........
r57747 | martin.v.loewis | 2007-08-30 20:14:01 +0200 (Thu, 30 Aug 2007) | 1 line

Fix popen usage.
........
r57748 | martin.v.loewis | 2007-08-30 20:15:22 +0200 (Thu, 30 Aug 2007) | 1 line

Fix typo.
........
r57750 | martin.v.loewis | 2007-08-30 20:25:47 +0200 (Thu, 30 Aug 2007) | 1 line

Bug #1746880: Correctly install DLLs into system32 folder on Win64.
........
r57760 | martin.v.loewis | 2007-08-30 21:04:09 +0200 (Thu, 30 Aug 2007) | 1 line

Bug #1709599: Run test_1565150 only if the file system is NTFS.
........
r57762 | martin.v.loewis | 2007-08-30 22:10:57 +0200 (Thu, 30 Aug 2007) | 2 lines

Bump autoconf minimum version to 2.61.
........
r57764 | lars.gustaebel | 2007-08-30 22:24:31 +0200 (Thu, 30 Aug 2007) | 2 lines

Warn about possible risks when extracting untrusted archives.
........
r57769 | thomas.wouters | 2007-08-30 23:01:17 +0200 (Thu, 30 Aug 2007) | 7 lines

Somewhat-preliminary slice-object and extended slicing support for ctypes.
The exact behaviour of omitted and negative indices for the Pointer type may
need a closer look (especially as it's subtly different from simple slices)
but there's time yet before 2.6, and not enough before 3.0a1 :-)
........
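As background for r57680 and r57771, the pattern they describe — importing ssl in a try-except-else and extending __all__ only when the import succeeds — looks roughly like this. This is a minimal sketch under assumed names, not the actual socket.py change; `sslwrap` and the `getaddrinfo` entry are hypothetical, used only for illustration:

    __all__ = ["getaddrinfo"]        # names that exist unconditionally (illustrative)

    try:
        import ssl as _ssl           # may fail on builds without OpenSSL
    except ImportError:
        pass                         # no SSL: don't advertise SSL-only names
    else:
        def sslwrap(sock):           # hypothetical helper, for illustration only
            return _ssl.wrap_socket(sock)
        __all__.append("sslwrap")    # exported only when it was actually created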
295 lines | 9.9 KiB | Python
""" robotparser.py
|
|
|
|
Copyright (C) 2000 Bastian Kleineidam
|
|
|
|
You can choose between two licenses when using this package:
|
|
1) GNU GPLv2
|
|
2) PSF license for Python 2.2
|
|
|
|
The robots.txt Exclusion Protocol is implemented as specified in
|
|
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
|
|
"""
|
|
import urlparse,urllib
|
|
|
|
__all__ = ["RobotFileParser"]
|
|
|
|
debug = 0
|
|
|
|
def _debug(msg):
|
|
if debug: print(msg)
|
|
|
|
|
|
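# Note: the module-level `debug` flag above gates all _debug() output; for
# example, a caller can set `robotparser.debug = 1` before calling read() or
# parse() to see the parser's diagnostics (illustrative usage, assuming the
# module is imported under its usual name).
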
class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.
    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.
        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.
        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = []
        line = f.readline()
        while line:
            lines.append(line.strip())
            line = f.readline()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            self.disallow_all = True
            _debug("disallow all")
        elif self.errcode >= 400:
            self.allow_all = True
            _debug("allow all")
        elif self.errcode == 200 and lines:
            _debug("parse lines")
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # state machine:
        #   0 -- start state, or just finished an entry
        #   1 -- inside an entry, seen only user-agent line(s) so far
        #   2 -- inside an entry, seen at least one rule line
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber = linenumber + 1
            if not line:
                if state == 1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], True))
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                                                                 line[0]))
            else:
                _debug("line %d: error: malformed line %s" % (linenumber, line))
        if state == 2:
            self.entries.append(entry)
        _debug("Parsed rules:\n%s" % str(self))

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        _debug("Checking robots.txt allowance for:\n user agent: %s\n url: %s" %
               (useragent, url))
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])

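# A minimal usage sketch (the host and agent names are hypothetical):
#
#   >>> rp = RobotFileParser('http://example.com/robots.txt')
#   >>> rp.read()      # fetch and parse robots.txt
#   >>> rp.can_fetch('MyCrawler/1.0', 'http://example.com/private/index.html')
#
# can_fetch() returns True unless a matching Disallow rule applies, or the
# robots.txt fetch itself came back 401/403 (which disallows everything).
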
class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path

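# For example (hypothetical paths): RuleLine("/cgi-bin/", False) disallows
# every path that starts with "/cgi-bin/", while RuleLine("", False) is
# turned into an allow-all rule by the empty-path special case above:
#
#   >>> RuleLine("/cgi-bin/", False).applies_to("/cgi-bin/search")
#   True
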
class Entry:
    """An entry has one or more user-agents and zero or more rulelines."""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            _debug((filename, str(line), line.allowance))
            if line.applies_to(filename):
                return line.allowance
        return True

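# Matching is deliberately loose: the requesting user agent is reduced to its
# name token (the part before "/") and lower-cased, and an entry's agent name
# only has to occur as a substring of that token.  So, with hypothetical
# names, an entry for "CherryPickerSE" applies to "CherryPickerSE/1.0" and
# "cherrypickerse/1.5" alike.
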
class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)

def _check(a, b):
    if not b:
        ac = "access denied"
    else:
        ac = "access allowed"
    if a != b:
        print("failed")
    else:
        print("ok (%s)" % ac)
    print()

def _test():
    global debug
    rp = RobotFileParser()
    debug = 1

    # robots.txt that exists, gotten to by redirection
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()

    # test for re.escape
    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
    # this should match the first rule, which is a disallow
    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
    # various cherry pickers
    _check(rp.can_fetch('CherryPickerSE',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.0',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.5',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    # case sensitivity
    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
    # substring test
    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
    # tests for catch-all * agent
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)

    # robots.txt that does not exist
    rp.set_url('http://www.lycos.com/robots.txt')
    rp.read()
    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)

if __name__ == '__main__':
    _test()
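
Because _test() above depends on live hosts, a self-contained check can feed parse() canned lines instead of going over the network; a sketch, assuming this file is importable as robotparser:

    from robotparser import RobotFileParser

    rp = RobotFileParser()
    rp.parse([
        "User-agent: *",
        "Disallow: /private/",
    ])
    print(rp.can_fetch("AnyBot", "http://example.com/private/page.html"))  # False
    print(rp.can_fetch("AnyBot", "http://example.com/public.html"))        # True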