rutracker revision

- Now uses requests with more logging
- Update to latest BeautifulSoup and html5lib libs
This commit is contained in:
Ade
2015-08-02 12:18:25 +12:00
parent d90a31afc7
commit d2782179aa
28 changed files with 1268 additions and 933 deletions

223
headphones/rutracker.py Normal file
View File

@@ -0,0 +1,223 @@
#!/usr/bin/env python
import urllib
import requests as requests
from urlparse import urlparse
from bs4 import BeautifulSoup
import os
import re
import headphones
from headphones import logger
class Rutracker(object):
    """Search rutracker.org and download .torrent data over one HTTP session.

    A single ``requests`` session is kept for the lifetime of the object so
    the login cookie obtained by :meth:`login` is reused by :meth:`search`
    and :meth:`get_torrent_data`.

    :meth:`searchurl` stores the size limit for the requested format in
    ``self.maxsize``; :meth:`search` reads it, so build the url first.
    """

    # Markup of the login form; its presence in a page means the request
    # was answered with the login page instead of the requested content.
    _LOGIN_FORM_MARKER = "action=\"http://login.rutracker.org/forum/login.php\">"

    # Byte multipliers for the size units handled in the results table.
    _SIZE_UNITS = {
        'KB': 1024,
        'MB': 1024 ** 2,
        'GB': 1024 ** 3,
        'TB': 1024 ** 4,
    }

    def __init__(self):
        self.session = requests.session()
        self.timeout = 60
        self.loggedin = False
        self.maxsize = 0
        self.search_referer = 'http://rutracker.org/forum/tracker.php'

    @staticmethod
    def _torrent_id(url):
        """Return the value of the ``t`` query parameter of ``url``.

        Raises KeyError when the parameter is missing and ValueError when
        a query component has no ``=``.
        """
        query = urlparse(url)[4]
        return dict(part.split('=', 1) for part in query.split('&'))['t']

    def logged_in(self):
        """Return the result of the last login attempt."""
        return self.loggedin

    def still_logged_in(self, html):
        """
        Return False when ``html`` is empty or contains the login form,
        True otherwise. ``html`` must be the page markup as a string.
        """
        return bool(html) and self._LOGIN_FORM_MARKER not in html

    def login(self):
        """
        Logs in user

        Posts the configured credentials to the login page and records the
        outcome in ``self.loggedin``. Returns True on success, False
        otherwise; never raises.
        """
        loginpage = 'http://login.rutracker.org/forum/login.php'
        post_params = {
            'login_username': headphones.CONFIG.RUTRACKER_USER,
            'login_password': headphones.CONFIG.RUTRACKER_PASSWORD,
            # Value of the submit button, sent as raw bytes
            # (url-encoded form: '%C2%F5%EE%E4').
            'login': b'\xc2\xf5\xee\xe4'
        }

        logger.info("Attempting to log in to rutracker...")
        self.loggedin = False

        # User agent doesn't seem to matter, so none is set.
        try:
            r = self.session.post(loginpage, data=post_params, timeout=self.timeout)
        except Exception as e:
            logger.error("Unknown error logging in to rutracker: %s" % e)
            return self.loggedin

        if r.status_code != 200:
            logger.error("rutracker login returned status code %s" % r.status_code)
        elif 'bb_data' in r.cookies.keys():
            # The site hands out the 'bb_data' cookie on a successful login.
            self.loggedin = True
            logger.info("Successfully logged in to rutracker")
        else:
            logger.error("Could not login to rutracker, credentials maybe incorrect, "
                         "site is down or too many attempts")
        return self.loggedin

    def searchurl(self, artist, album, year, format):
        """
        Return the search url

        ``format`` is 'lossless', 'lossless+mp3' or anything else (treated
        as mp3/aac). As a side effect ``self.maxsize`` is set to the size
        limit, in bytes, used later by :meth:`search`.
        """
        # Build search term; the artist is left out for compilations.
        searchterm = ''
        if artist != 'Various Artists':
            searchterm = artist + ' '
        searchterm = searchterm + album + ' ' + year

        if format == 'lossless':
            format_filter = '+lossless'
            self.maxsize = 10000000000
        elif format == 'lossless+mp3':
            format_filter = '+lossless||mp3||aac'
            self.maxsize = 10000000000
        else:
            format_filter = '+mp3||aac'
            self.maxsize = 300000000

        # sort by size, descending.
        sort = '&o=7&s=2'

        searchurl = "%s?nm=%s%s%s" % (self.search_referer, urllib.quote(searchterm),
                                      format_filter, sort)
        logger.info("Searching rutracker using term: %s", searchterm)
        return searchurl

    def search(self, searchurl):
        """
        Parse the search results and return valid torrent list

        Returns a list of ``(title, size_in_bytes, topic_url, 'rutracker.org',
        'torrent', True)`` tuples (possibly empty), or None when the page
        could not be fetched/parsed or holds no results table.
        """
        try:
            headers = {'Referer': self.search_referer}
            r = self.session.get(url=searchurl, headers=headers, timeout=self.timeout)

            # Check if still logged in. The raw markup is tested, not the
            # parsed soup: `in` on a soup only looks at its direct children
            # and would never find the login form.
            if not self.still_logged_in(r.text):
                self.login()
                r = self.session.get(url=searchurl, headers=headers, timeout=self.timeout)
                if not self.still_logged_in(r.text):
                    logger.error("Error getting rutracker data")
                    return None

            soup = BeautifulSoup(r.content, 'html5lib')
            table = soup.find('table', id='tor-tbl')
            if not table:
                logger.info("No valid results found from rutracker")
                return None

            # "-1" so the comparison below can stay a strict "<".
            minimumseeders = int(headphones.CONFIG.NUMBEROFSEEDERS) - 1

            rulist = []
            rows = zip(table.find_all(class_='hl-tags'),
                       table.find_all(class_='dl-stub'),
                       table.find_all(class_='seedmed'))
            for title_tag, download_tag, seed_tag in rows:
                title = title_tag.get_text()
                url = download_tag.get('href')
                # The link text is "<number> <unit>" followed by two
                # trailing characters that are not part of the size.
                size_formatted = download_tag.get_text()[:-2]
                seeds = int(seed_tag.get_text())

                size_parts = size_formatted.split()
                size = float(size_parts[0]) * self._SIZE_UNITS.get(size_parts[1], 1)

                if size < self.maxsize and minimumseeders < seeds:
                    logger.info('Found %s. Size: %s' % (title, size_formatted))
                    # Torrent topic page
                    topicurl = 'http://rutracker.org/forum/viewtopic.php?t=' + self._torrent_id(url)
                    rulist.append((title, size, topicurl, 'rutracker.org', 'torrent', True))
                else:
                    logger.info("%s is larger than the maxsize or has too little seeders for this category, "
                                "skipping. (Size: %i bytes, Seeders: %i)" % (title, size, seeds))

            if not rulist:
                logger.info("No valid results found from rutracker")
            return rulist

        except Exception as e:
            logger.error("An unknown error occurred in the rutracker parser: %s" % e)
            return None

    def get_torrent_data(self, url):
        """
        return the .torrent data

        ``url`` is a topic url carrying the torrent id in its ``t`` query
        parameter. Returns the response body, or False when the id cannot
        be read, the request fails or the server does not answer 200.
        """
        try:
            torrent_id = self._torrent_id(url)
            downloadurl = 'http://dl.rutracker.org/forum/dl.php?t=' + torrent_id
            # The download request carries the torrent id in a 'bb_dl' cookie.
            cookie = {'bb_dl': torrent_id}
            headers = {'Referer': url}
            r = self.session.get(url=downloadurl, cookies=cookie, headers=headers,
                                 timeout=self.timeout)
            if r.status_code != 200:
                # Do not hand an error page to the torrent client.
                logger.error('Error getting torrent: %s', 'status code %s' % r.status_code)
                return False
            return r.content
        except Exception as e:
            logger.error('Error getting torrent: %s', e)
            return False

    # TODO get this working in utorrent.py
    def utorrent_add_file(self, data):
        """
        Upload raw .torrent ``data`` to the configured uTorrent web UI.

        Fetches an API token from ``<host>/gui/token.html`` and posts the
        data with ``action=add-file``. Errors are logged; returns None.
        """
        host = headphones.CONFIG.UTORRENT_HOST
        if not host.startswith('http'):
            host = 'http://' + host
        if host.endswith('/'):
            host = host[:-1]
        if host.endswith('/gui'):
            host = host[:-4]
        url = host + '/gui/'

        # Dedicated session: setting auth/params on self.session would send
        # the uTorrent credentials and token to rutracker on later requests.
        session = requests.Session()
        session.auth = (headphones.CONFIG.UTORRENT_USERNAME, headphones.CONFIG.UTORRENT_PASSWORD)

        try:
            r = session.get(url + 'token.html', timeout=self.timeout)
        except Exception as e:
            logger.error('Error getting token: %s', e)
            return

        if r.status_code == 401:
            logger.debug('Error reaching utorrent')
            return

        regex = re.search(r'.+>([^<]+)</div></html>', r.text)
        if regex is None:
            logger.debug('Error reading token')
            return

        session.params = {'token': regex.group(1)}
        files = {'torrent_file': ("", data)}

        try:
            session.post(url, params={'action': 'add-file'}, files=files,
                         timeout=self.timeout)
        except Exception as e:
            logger.exception('Error adding file to utorrent %s', e)

View File

@@ -36,12 +36,10 @@ import unicodedata
from headphones.common import USER_AGENT
from headphones import logger, db, helpers, classes, sab, nzbget, request
from headphones import utorrent, transmission, notifiers
from headphones import utorrent, transmission, notifiers, rutracker
from bencode import bencode, bdecode
import headphones.searcher_rutracker as rutrackersearch
# Magnet to torrent services, for Black hole. Stolen from CouchPotato.
TORRENT_TO_MAGNET_SERVICES = [
'https://zoink.it/torrent/%s.torrent',
@@ -51,9 +49,7 @@ TORRENT_TO_MAGNET_SERVICES = [
# Persistent What.cd API object
gazelle = None
# RUtracker search object
rutracker = rutrackersearch.Rutracker()
ruobj = None
def fix_url(s, charset="utf-8"):
@@ -818,15 +814,9 @@ def send_to_downloader(data, bestqual, album):
"to open or convert magnet links")
return
else:
if bestqual[3] == "rutracker.org":
download_path, _ = rutracker.get_torrent(bestqual[2],
headphones.CONFIG.TORRENTBLACKHOLE_DIR)
if not download_path:
return
else:
if not torrent_to_file(download_path, data):
return
if not torrent_to_file(download_path, data):
return
# Extract folder name from torrent
folder_name = read_torrent_name(download_path, bestqual[0])
@@ -836,13 +826,11 @@ def send_to_downloader(data, bestqual, album):
elif headphones.CONFIG.TORRENT_DOWNLOADER == 1:
logger.info("Sending torrent to Transmission")
# rutracker needs cookies to be set, pass the .torrent file instead of url
# Add torrent
if bestqual[3] == 'rutracker.org':
file_or_url, torrentid = rutracker.get_torrent(bestqual[2])
torrentid = transmission.addTorrent('', data)
else:
file_or_url = bestqual[2]
torrentid = transmission.addTorrent(file_or_url)
torrentid = transmission.addTorrent(bestqual[2])
if not torrentid:
logger.error("Error sending torrent to Transmission. Are you sure it's running?")
@@ -855,13 +843,6 @@ def send_to_downloader(data, bestqual, album):
logger.error('Torrent folder name could not be determined')
return
# remove temp .torrent file created above
if bestqual[3] == 'rutracker.org':
try:
shutil.rmtree(os.path.split(file_or_url)[0])
except Exception as e:
logger.exception("Unhandled exception")
# Set Seed Ratio
seed_ratio = get_seed_ratio(bestqual[3])
if seed_ratio is not None:
@@ -870,30 +851,30 @@ def send_to_downloader(data, bestqual, album):
else:# if headphones.CONFIG.TORRENT_DOWNLOADER == 2:
logger.info("Sending torrent to uTorrent")
# rutracker needs cookies to be set, pass the .torrent file instead of url
# Add torrent
if bestqual[3] == 'rutracker.org':
file_or_url, torrentid = rutracker.get_torrent(bestqual[2])
folder_name, cacheid = utorrent.dirTorrent(torrentid)
folder_name = os.path.basename(os.path.normpath(folder_name))
utorrent.labelTorrent(torrentid)
ruobj.utorrent_add_file(data)
else:
file_or_url = bestqual[2]
torrentid = calculate_torrent_hash(file_or_url, data)
folder_name = utorrent.addTorrent(file_or_url, torrentid)
utorrent.addTorrent(bestqual[2])
# Get hash
torrentid = calculate_torrent_hash(bestqual[2], data)
if not torrentid:
logger.error('Torrent id could not be determined')
return
# Set Label
if headphones.CONFIG.UTORRENT_LABEL:
utorrent.labelTorrent(torrentid)
# Get folder
folder_name = utorrent.getFolder(torrentid)
if folder_name:
logger.info('Torrent folder name: %s' % folder_name)
else:
logger.error('Torrent folder name could not be determined')
return
# remove temp .torrent file created above
if bestqual[3] == 'rutracker.org':
try:
shutil.rmtree(os.path.split(file_or_url)[0])
except Exception as e:
logger.exception("Unhandled exception")
# Set Seed Ratio
seed_ratio = get_seed_ratio(bestqual[3])
if seed_ratio is not None:
@@ -1041,12 +1022,7 @@ def verifyresult(title, artistterm, term, lossless):
def searchTorrent(album, new=False, losslessOnly=False, albumlength=None, choose_specific_download=False):
global gazelle # persistent what.cd api object to reduce number of login attempts
# rutracker login
if headphones.CONFIG.RUTRACKER and album:
rulogin = rutracker.login(headphones.CONFIG.RUTRACKER_USER, headphones.CONFIG.RUTRACKER_PASSWORD)
if not rulogin:
logger.info(u'Could not login to rutracker, search results will exclude this provider')
global ruobj # and rutracker
albumid = album['AlbumID']
reldate = album['ReleaseDate']
@@ -1239,45 +1215,38 @@ def searchTorrent(album, new=False, losslessOnly=False, albumlength=None, choose
logger.error(u"An error occurred while trying to parse the response from Waffles.fm: %s", e)
# rutracker.org
if headphones.CONFIG.RUTRACKER and rulogin:
if headphones.CONFIG.RUTRACKER:
provider = "rutracker.org"
# Ignore if release date not specified, results too unpredictable
if not year and not usersearchterm:
logger.info(u'Release date not specified, ignoring for rutracker.org')
logger.info(u"Release date not specified, ignoring for rutracker.org")
else:
if headphones.CONFIG.PREFERRED_QUALITY == 3 or losslessOnly:
format = 'lossless'
maxsize = 10000000000
elif headphones.CONFIG.PREFERRED_QUALITY == 1 or allow_lossless:
format = 'lossless+mp3'
maxsize = 10000000000
else:
format = 'mp3'
maxsize = 300000000
# build search url based on above
if not usersearchterm:
searchURL = rutracker.searchurl(artistterm, albumterm, year, format)
else:
searchURL = rutracker.searchurl(usersearchterm, ' ', ' ', format)
# Login
if not ruobj or not ruobj.logged_in():
ruobj = rutracker.Rutracker()
if not ruobj.login():
ruobj = None
logger.info(u'Parsing results from <a href="%s">rutracker.org</a>' % searchURL)
if ruobj and ruobj.logged_in():
# parse results and get best match
rulist = rutracker.search(searchURL, maxsize, minimumseeders, albumid)
# build search url
if not usersearchterm:
searchURL = ruobj.searchurl(artistterm, albumterm, year, format)
else:
searchURL = ruobj.searchurl(usersearchterm, ' ', ' ', format)
# add best match to overall results list
if rulist:
for ru in rulist:
title = ru[0].decode('utf-8')
size = ru[1]
url = ru[2]
resultlist.append((title, size, url, provider, 'torrent', True))
logger.info('Found %s. Size: %s' % (title, helpers.bytes_to_mb(size)))
else:
logger.info(u"No valid results found from %s" % (provider))
# parse results
rulist = ruobj.search(searchURL)
if rulist:
resultlist.extend(rulist)
if headphones.CONFIG.WHATCD:
provider = "What.cd"
@@ -1567,12 +1536,14 @@ def preprocess(resultlist):
for result in resultlist:
if result[4] == 'torrent':
# rutracker always needs the torrent data
if result[3] == 'rutracker.org':
return ruobj.get_torrent_data(result[2]), result
#Get out of here if we're using Transmission
if headphones.CONFIG.TORRENT_DOWNLOADER == 1: ## if not a magnet link still need the .torrent to generate hash... uTorrent support labeling
return True, result
# get outta here if rutracker
if result[3] == 'rutracker.org':
return True, result
# Get out of here if it's a magnet link
if result[2].lower().startswith("magnet:"):
return True, result

View File

@@ -1,349 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
# Headphones rutracker.org search
# Functions called from searcher.py
from bencode import bencode as bencode, bdecode
from urlparse import urlparse
from bs4 import BeautifulSoup
from tempfile import mkdtemp
from hashlib import sha1
import headphones
import requests
import cookielib
import urllib2
import urllib
import re
import os
from headphones import db, logger
class Rutracker():
    """Legacy urllib2-based rutracker.org client, called from searcher.py.

    Cookies are kept in ``self.cookiejar`` and every request goes through
    ``self.opener``, which is also installed as the global urllib2 opener.
    """

    logged_in = False

    def __init__(self):
        self.cookiejar = cookielib.CookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookiejar))
        urllib2.install_opener(self.opener)

    @staticmethod
    def _torrent_id(url):
        """Return the value of the ``t`` query parameter of ``url``."""
        query = urlparse(url)[4]
        return dict([part.split('=') for part in query.split('&')])['t']

    def _set_download_cookie(self, torrent_id):
        """Put the 'bb_dl' cookie for ``torrent_id`` into the cookie jar."""
        self.cookiejar.set_cookie(cookielib.Cookie(
            version=0, name='bb_dl', value=torrent_id, port=None,
            port_specified=False, domain='.rutracker.org',
            domain_specified=False, domain_initial_dot=False, path='/',
            path_specified=True, secure=False, expires=None, discard=True,
            comment=None, comment_url=None, rest={'HttpOnly': None},
            rfc2109=False))

    def login(self, login, password):
        """Implements tracker login procedure.

        Returns True when the 'bb_data' cookie is present after posting
        the credentials, False otherwise (including when either credential
        is None). The result is also stored in ``self.logged_in``.
        """
        self.logged_in = False

        if login is None or password is None:
            return False

        params = urllib.urlencode({"login_username": login,
                                   "login_password": password,
                                   "login": "Вход"})
        try:
            self.opener.open("http://login.rutracker.org/forum/login.php", params)
        except Exception as e:
            # Best effort: the cookie check below decides the outcome,
            # but the failure should not vanish silently.
            logger.error('Error logging in to rutracker: %s' % e)

        # Check if we're logged in
        self.logged_in = any(cookie.name == 'bb_data' for cookie in self.cookiejar)
        return self.logged_in

    def searchurl(self, artist, album, year, format):
        """
        Return the search url

        ``format`` is 'lossless', 'lossless+mp3' or anything else (treated
        as mp3/aac).
        """
        # Build search term; the artist is left out for compilations.
        searchterm = ''
        if artist != 'Various Artists':
            searchterm = artist + ' '
        searchterm = searchterm + album + ' ' + year

        providerurl = "http://rutracker.org/forum/tracker.php"

        if format == 'lossless':
            format_filter = '+lossless'
        elif format == 'lossless+mp3':
            format_filter = '+lossless||mp3||aac'
        else:
            format_filter = '+mp3||aac'

        # sort by size, descending.
        sort = '&o=7&s=2'

        return "%s?nm=%s%s%s" % (providerurl, urllib.quote(searchterm), format_filter, sort)

    def search(self, searchurl, maxsize, minseeders, albumid):
        """
        Parse the search results and return valid torrent list

        Returns a list of ``(title, size, topic_url)`` tuples for results
        no larger than ``maxsize`` with at least ``minseeders`` seeders,
        or False when nothing was found. ``albumid`` is only used by the
        (currently disabled) track-count check.
        """
        titles = []
        urls = []
        seeders = []
        sizes = []
        rulist = []

        try:
            page = self.opener.open(searchurl, timeout=60)
            soup = BeautifulSoup(page.read())

            # Title
            for link in soup.find_all('a', attrs={'class': 'med tLink hl-tags bold'}):
                titles.append(link.get_text())

            # Download URL
            for link in soup.find_all('a', attrs={'class': 'small tr-dl dl-stub'}):
                urls.append(link.get('href'))

            # Seeders
            for link in soup.find_all('b', attrs={'class': 'seedmed'}):
                seeders.append(link.get_text())

            # Size
            for link in soup.find_all('td', attrs={'class': 'row4 small nowrap tor-size'}):
                sizes.append(link.u.string)
        except Exception as e:
            # Keep whatever was collected so far, but report the failure.
            logger.error('Error parsing rutracker search results: %s' % e)

        # Combine lists
        torrentlist = list(zip(titles, urls, seeders, sizes))

        # return if nothing found
        if not torrentlist:
            return False

        # don't bother checking track counts anymore, let searcher filter instead
        # leave code in just in case
        check_track_count = False
        hptrackcount = 0

        if check_track_count:
            # get headphones track count for album, return if not found
            myDB = db.DBConnection()
            tracks = myDB.select('SELECT * from tracks WHERE AlbumID=?', [albumid])
            hptrackcount = len(tracks)

            if not hptrackcount:
                logger.info('headphones track info not found, cannot compare to torrent')
                return False

        # Return all valid entries, ignored, required words now checked in searcher.py
        formatlist = ['ape', 'flac', 'ogg', 'm4a', 'aac', 'mp3', 'wav', 'aif']
        deluxelist = ['deluxe', 'edition', 'japanese', 'exclusive']

        for raw_title, url, seed_count, size in torrentlist:
            returntitle = raw_title.encode('utf-8')

            if not (int(size) <= maxsize and int(seed_count) >= minseeders):
                logger.info('%s is larger than the maxsize or has too little seeders for this category, skipping. (Size: %i bytes, Seeders: %i)' % (returntitle, int(size), int(seed_count)))
                continue

            # Torrent topic page
            torrent_id = self._torrent_id(url)
            topicurl = 'http://rutracker.org/forum/viewtopic.php?t=' + torrent_id

            if not check_track_count:
                valid = True
            else:
                # Check torrent info
                self._set_download_cookie(torrent_id)

                metainfo = {}
                try:
                    page = self.opener.open(url)
                    torrent = page.read()
                    if torrent:
                        decoded = bdecode(torrent)
                        metainfo = decoded['info']
                    page.close()
                except Exception as e:
                    logger.error('Error getting torrent: %s' % e)
                    return False

                # get torrent track count and check for cue
                trackcount = 0
                cuecount = 0

                if 'files' in metainfo:  # multi
                    for pathfile in metainfo['files']:
                        for name in pathfile['path']:
                            if any(name.lower().endswith('.' + x.lower()) for x in formatlist):
                                trackcount += 1
                            if '.cue' in name:
                                cuecount += 1

                title = returntitle.lower()
                logger.debug('torrent title: %s' % title)
                logger.debug('headphones trackcount: %s' % hptrackcount)
                logger.debug('rutracker trackcount: %s' % trackcount)

                # If torrent track count less than headphones track count, and there's a cue, then attempt to get track count from log(s)
                # This is for the case where we have a single .flac/.wav which can be split by cue
                # Not great, but shouldn't be doing this too often
                totallogcount = 0
                if trackcount < hptrackcount and cuecount > 0 and cuecount < hptrackcount:
                    page = self.opener.open(topicurl, timeout=60)
                    soup = BeautifulSoup(page.read())
                    findtoc = soup.find_all(text='TOC of the extracted CD')
                    if not findtoc:
                        findtoc = soup.find_all(text='TOC извлечённого CD')
                    for toc in findtoc:
                        logcount = 0
                        for toccontent in toc.find_all_next(text=True):
                            new_string = toccontent.split('|')[0].lstrip().rstrip()
                            if new_string == '1' or new_string == '01':
                                logcount = 1
                            elif logcount > 0:
                                if new_string.isdigit():
                                    logcount += 1
                                else:
                                    break
                        totallogcount = totallogcount + logcount

                if totallogcount > 0:
                    trackcount = totallogcount
                    logger.debug('rutracker logtrackcount: %s' % totallogcount)

                # If torrent track count = hp track count then return torrent,
                # if greater, check for deluxe/special/foreign editions
                # if less, then allow if it's a single track with a cue
                valid = False

                if trackcount == hptrackcount:
                    valid = True
                elif trackcount > hptrackcount:
                    if any(deluxe in title for deluxe in deluxelist):
                        valid = True

            # Add to list
            if valid:
                rulist.append((returntitle, size, topicurl))
            elif topicurl:
                logger.info(u'<a href="%s">Torrent</a> found with %s tracks but the selected headphones release has %s tracks, skipping for rutracker.org' % (topicurl, trackcount, hptrackcount))

        return rulist

    def get_torrent(self, url, savelocation=None):
        """
        Download the .torrent behind topic ``url`` and write it to disk.

        The file goes to ``savelocation`` when given, otherwise to a fresh
        temporary directory. When uTorrent is the configured downloader
        the file is also uploaded to it.

        Returns ``(download_path, info_hash)``. On failure returns
        ``(None, None)`` so that callers, which all unpack two values,
        can test ``download_path`` instead of crashing on the unpack.
        """
        try:
            torrent_id = self._torrent_id(url)
            self._set_download_cookie(torrent_id)
            downloadurl = 'http://dl.rutracker.org/forum/dl.php?t=' + torrent_id
            torrent_name = torrent_id + '.torrent'

            prev = os.umask(headphones.UMASK)
            try:
                page = self.opener.open(downloadurl)
                torrent = page.read()
                decoded = bdecode(torrent)
                metainfo = decoded['info']
                tor_hash = sha1(bencode(metainfo)).hexdigest()

                if savelocation:
                    download_path = os.path.join(savelocation, torrent_name)
                else:
                    tempdir = mkdtemp(suffix='_rutracker_torrents')
                    download_path = os.path.join(tempdir, torrent_name)

                with open(download_path, 'wb') as f:
                    f.write(torrent)
            finally:
                # Restore the process umask even when the download fails.
                os.umask(prev)

            # Add file to utorrent
            if headphones.CONFIG.TORRENT_DOWNLOADER == 2:
                self.utorrent_add_file(download_path)

        except Exception as e:
            logger.error('Error getting torrent: %s', e)
            return None, None

        return download_path, tor_hash

    # TODO get this working in utorrent.py
    def utorrent_add_file(self, filename):
        """
        Upload the .torrent file ``filename`` to the configured uTorrent
        web UI. Errors are logged; returns None.
        """
        host = headphones.CONFIG.UTORRENT_HOST
        if not host.startswith('http'):
            host = 'http://' + host
        if host.endswith('/'):
            host = host[:-1]
        if host.endswith('/gui'):
            host = host[:-4]
        url = host + '/gui/'

        session = requests.Session()
        session.auth = (headphones.CONFIG.UTORRENT_USERNAME,
                        headphones.CONFIG.UTORRENT_PASSWORD)

        try:
            r = session.get(url + 'token.html')
        except Exception:
            logger.exception('Error getting token')
            return

        # status_code is an int; comparing with the string '401' never matched.
        if r.status_code == 401:
            logger.debug('Error reaching utorrent')
            return

        regex = re.search(r'.+>([^<]+)</div></html>', r.text)
        if regex is None:
            logger.debug('Error reading token')
            return

        session.params = {'token': regex.group(1)}

        with open(filename, 'rb') as f:
            try:
                session.post(url, params={'action': 'add-file'},
                             files={'torrent_file': f})
            except Exception:
                logger.exception('Error adding file to utorrent')
                return

View File

@@ -28,12 +28,15 @@ import headphones
# Store torrent id so we can check up on it
def addTorrent(link):
def addTorrent(link, data=None):
method = 'torrent-add'
if link.endswith('.torrent'):
with open(link, 'rb') as f:
metainfo = str(base64.b64encode(f.read()))
if link.endswith('.torrent') or data:
if data:
metainfo = str(base64.b64encode(data))
else:
with open(link, 'rb') as f:
metainfo = str(base64.b64encode(f.read()))
arguments = {'metainfo': metainfo, 'download-dir': headphones.CONFIG.DOWNLOAD_TORRENT_DIR}
else:
arguments = {'filename': link, 'download-dir': headphones.CONFIG.DOWNLOAD_TORRENT_DIR}

View File

@@ -220,7 +220,7 @@ def dirTorrent(hash, cacheid=None, return_name=None):
cacheid = torrentList['torrentc']
for torrent in torrents:
if torrent[0].upper() == hash:
if torrent[0].upper() == hash.upper():
if not return_name:
return torrent[26], cacheid
else:
@@ -228,8 +228,12 @@ def dirTorrent(hash, cacheid=None, return_name=None):
return None, None
def addTorrent(link):
uTorrentClient = utorrentclient()
uTorrentClient.add_url(link)
def addTorrent(link, hash):
def getFolder(hash):
uTorrentClient = utorrentclient()
# Get Active Directory from settings
@@ -239,8 +243,6 @@ def addTorrent(link, hash):
logger.error('Could not get "Put new downloads in:" directory from uTorrent settings, please ensure it is set')
return None
uTorrentClient.add_url(link)
# Get Torrent Folder Name
torrent_folder, cacheid = dirTorrent(hash)
@@ -254,10 +256,8 @@ def addTorrent(link, hash):
if torrent_folder == active_dir or not torrent_folder:
torrent_folder, cacheid = dirTorrent(hash, cacheid, return_name=True)
labelTorrent(hash)
return torrent_folder
else:
labelTorrent(hash)
if headphones.SYS_PLATFORM != "win32":
torrent_folder = torrent_folder.replace('\\', '/')
return os.path.basename(os.path.normpath(torrent_folder))

View File

@@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.3.2"
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
__version__ = "4.4.0"
__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
@@ -45,7 +45,7 @@ from .element import (
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
"""
@@ -77,8 +77,11 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, **kwargs):
parse_only=None, from_encoding=None, exclude_encodings=None,
**kwargs):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser."""
@@ -114,9 +117,9 @@ class BeautifulSoup(Tag):
del kwargs['isHTML']
warnings.warn(
"BS4 does not respect the isHTML argument to the "
"BeautifulSoup constructor. You can pass in features='html' "
"or features='xml' to get a builder capable of handling "
"one or the other.")
"BeautifulSoup constructor. Suggest you use "
"features='lxml' for HTML and features='lxml-xml' for "
"XML.")
def deprecated_argument(old_name, new_name):
if old_name in kwargs:
@@ -140,6 +143,7 @@ class BeautifulSoup(Tag):
"__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None:
original_features = features
if isinstance(features, basestring):
features = [features]
if features is None or len(features) == 0:
@@ -151,6 +155,16 @@ class BeautifulSoup(Tag):
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
builder = builder_class()
if not (original_features == builder.NAME or
original_features in builder.ALTERNATE_NAMES):
if builder.is_xml:
markup_type = "XML"
else:
markup_type = "HTML"
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
parser=builder.NAME,
markup_type=markup_type))
self.builder = builder
self.is_xml = builder.is_xml
self.builder.soup = self
@@ -178,6 +192,8 @@ class BeautifulSoup(Tag):
# system. Just let it go.
pass
if is_file:
if isinstance(markup, unicode):
markup = markup.encode("utf8")
warnings.warn(
'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
if markup[:5] == "http:" or markup[:6] == "https:":
@@ -185,12 +201,15 @@ class BeautifulSoup(Tag):
# Python 3 otherwise.
if ((isinstance(markup, bytes) and not b' ' in markup)
or (isinstance(markup, unicode) and not u' ' in markup)):
if isinstance(markup, unicode):
markup = markup.encode("utf8")
warnings.warn(
'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
for (self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) in (
self.builder.prepare_markup(markup, from_encoding)):
self.builder.prepare_markup(
markup, from_encoding, exclude_encodings=exclude_encodings)):
self.reset()
try:
self._feed()
@@ -203,6 +222,16 @@ class BeautifulSoup(Tag):
self.markup = None
self.builder.soup = None
def __copy__(self):
return type(self)(self.encode(), builder=self.builder)
def __getstate__(self):
# Frequently a tree builder can't be pickled.
d = dict(self.__dict__)
if 'builder' in d and not self.builder.picklable:
del d['builder']
return d
def _feed(self):
# Convert the document to Unicode.
self.builder.reset()
@@ -229,9 +258,7 @@ class BeautifulSoup(Tag):
def new_string(self, s, subclass=NavigableString):
"""Create a new NavigableString associated with this soup."""
navigable = subclass(s)
navigable.setup()
return navigable
return subclass(s)
def insert_before(self, successor):
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
@@ -290,14 +317,49 @@ class BeautifulSoup(Tag):
def object_was_parsed(self, o, parent=None, most_recent_element=None):
"""Add an object to the parse tree."""
parent = parent or self.currentTag
most_recent_element = most_recent_element or self._most_recent_element
o.setup(parent, most_recent_element)
previous_element = most_recent_element or self._most_recent_element
next_element = previous_sibling = next_sibling = None
if isinstance(o, Tag):
next_element = o.next_element
next_sibling = o.next_sibling
previous_sibling = o.previous_sibling
if not previous_element:
previous_element = o.previous_element
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
if most_recent_element is not None:
most_recent_element.next_element = o
self._most_recent_element = o
parent.contents.append(o)
if parent.next_sibling:
# This node is being inserted into an element that has
# already been parsed. Deal with any dangling references.
index = parent.contents.index(o)
if index == 0:
previous_element = parent
previous_sibling = None
else:
previous_element = previous_sibling = parent.contents[index-1]
if index == len(parent.contents)-1:
next_element = parent.next_sibling
next_sibling = None
else:
next_element = next_sibling = parent.contents[index+1]
o.previous_element = previous_element
if previous_element:
previous_element.next_element = o
o.next_element = next_element
if next_element:
next_element.previous_element = o
o.next_sibling = next_sibling
if next_sibling:
next_sibling.previous_sibling = o
o.previous_sibling = previous_sibling
if previous_sibling:
previous_sibling.next_sibling = o
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag

View File

@@ -80,9 +80,12 @@ builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
"""Turn a document into a Beautiful Soup object tree."""
NAME = "[Unknown tree builder]"
ALTERNATE_NAMES = []
features = []
is_xml = False
picklable = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.

View File

@@ -2,6 +2,7 @@ __all__ = [
'HTML5TreeBuilder',
]
from pdb import set_trace
import warnings
from bs4.builder import (
PERMISSIVE,
@@ -9,7 +10,10 @@ from bs4.builder import (
HTML_5,
HTMLTreeBuilder,
)
from bs4.element import NamespacedAttribute
from bs4.element import (
NamespacedAttribute,
whitespace_re,
)
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
@@ -22,11 +26,20 @@ from bs4.element import (
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
features = ['html5lib', PERMISSIVE, HTML_5, HTML]
NAME = "html5lib"
def prepare_markup(self, markup, user_specified_encoding):
features = [NAME, PERMISSIVE, HTML_5, HTML]
def prepare_markup(self, markup, user_specified_encoding,
document_declared_encoding=None, exclude_encodings=None):
# Store the user-specified encoding for use later on.
self.user_specified_encoding = user_specified_encoding
# document_declared_encoding and exclude_encodings aren't used
# ATM because the html5lib TreeBuilder doesn't use
# UnicodeDammit.
if exclude_encodings:
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
yield (markup, None, None, False)
# These methods are defined by Beautiful Soup.
@@ -101,7 +114,13 @@ class AttrList(object):
def __iter__(self):
return list(self.attrs.items()).__iter__()
def __setitem__(self, name, value):
    """Set attribute `name` on the wrapped element.

    When `name` is a multi-valued attribute (listed under '*' or under
    this element's tag name in HTML5TreeBuilder.cdata_list_attributes),
    a string value is split on whitespace and stored as a list.

    :param name: The attribute name.
    :param value: The attribute value; a string, or a list that has
        already been split.
    """
    # If this attribute is a multi-valued attribute for this element,
    # turn its value into a list.
    list_attr = HTML5TreeBuilder.cdata_list_attributes
    if (name in list_attr['*']
        or (self.element.name in list_attr
            and name in list_attr[self.element.name])):
        # A value that is already a list must be left alone:
        # whitespace_re.split() only accepts strings.
        if not isinstance(value, list):
            value = whitespace_re.split(value)
    self.element[name] = value
def items(self):
return list(self.attrs.items())
@@ -161,6 +180,12 @@ class Element(html5lib.treebuilders._base.Node):
# immediately after the parent, if it has no children.)
if self.element.contents:
most_recent_element = self.element._last_descendant(False)
elif self.element.next_element is not None:
# Something from further ahead in the parse tree is
# being inserted into this earlier element. This is
# very annoying because it means an expensive search
# for the last element in the tree.
most_recent_element = self.soup._last_descendant()
else:
most_recent_element = self.element
@@ -172,6 +197,7 @@ class Element(html5lib.treebuilders._base.Node):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes is not None and len(attributes) > 0:
converted_attributes = []
@@ -218,6 +244,9 @@ class Element(html5lib.treebuilders._base.Node):
def reparentChildren(self, new_parent):
"""Move all of this tag's children into another tag."""
# print "MOVE", self.element.contents
# print "FROM", self.element
# print "TO", new_parent.element
element = self.element
new_parent_element = new_parent.element
# Determine what this tag's next_element will be once all the children
@@ -236,17 +265,28 @@ class Element(html5lib.treebuilders._base.Node):
new_parents_last_descendant_next_element = new_parent_element.next_element
to_append = element.contents
append_after = new_parent.element.contents
append_after = new_parent_element.contents
if len(to_append) > 0:
# Set the first child's previous_element and previous_sibling
# to elements within the new parent
first_child = to_append[0]
first_child.previous_element = new_parents_last_descendant
if new_parents_last_descendant:
first_child.previous_element = new_parents_last_descendant
else:
first_child.previous_element = new_parent_element
first_child.previous_sibling = new_parents_last_child
if new_parents_last_descendant:
new_parents_last_descendant.next_element = first_child
else:
new_parent_element.next_element = first_child
if new_parents_last_child:
new_parents_last_child.next_sibling = first_child
# Fix the last child's next_element and next_sibling
last_child = to_append[-1]
last_child.next_element = new_parents_last_descendant_next_element
if new_parents_last_descendant_next_element:
new_parents_last_descendant_next_element.previous_element = last_child
last_child.next_sibling = None
for child in to_append:
@@ -257,6 +297,10 @@ class Element(html5lib.treebuilders._base.Node):
element.contents = []
element.next_element = final_next_element
# print "DONE WITH MOVE"
# print "FROM", self.element
# print "TO", new_parent_element
def cloneNode(self):
tag = self.soup.new_tag(self.element.name, self.namespace)
node = Element(tag, self.soup, self.namespace)
@@ -268,7 +312,7 @@ class Element(html5lib.treebuilders._base.Node):
return self.element.contents
def getNameTuple(self):
    """Return a (namespace, name) pair for this element.

    An element with no namespace of its own is reported in the HTML
    namespace, looked up as namespaces["html"].
    """
    # None is a singleton, so test identity rather than equality.
    if self.namespace is None:
        return namespaces["html"], self.name
    else:
        return self.namespace, self.name

View File

@@ -4,10 +4,16 @@ __all__ = [
'HTMLParserTreeBuilder',
]
from HTMLParser import (
HTMLParser,
HTMLParseError,
)
from HTMLParser import HTMLParser
try:
    from HTMLParser import HTMLParseError
except ImportError:
    # HTMLParseError is removed in Python 3.5. Since it can never be
    # thrown in 3.5, we can just define our own class as a placeholder.
    # (The exception object itself is not needed, so it is not bound;
    # this form of the except clause parses on Python 2 and Python 3.)
    class HTMLParseError(Exception):
        pass
import sys
import warnings
@@ -19,10 +25,10 @@ import warnings
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (
major > 3
or (major == 3 and minor > 2)
or (major == 3 and minor == 2 and release >= 3))
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
from bs4.element import (
CData,
@@ -63,7 +69,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
def handle_charref(self, name):
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed.
# it's fixed in all supported versions.
# http://bugs.python.org/issue13633
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'):
@@ -113,14 +120,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
def handle_pi(self, data):
self.soup.endData()
if data.endswith("?") and data.lower().startswith("xml"):
# "An XHTML processing instruction using the trailing '?'
# will cause the '?' to be included in data." - HTMLParser
# docs.
#
# Strip the question mark so we don't end up with two
# question marks.
data = data[:-1]
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
@@ -128,15 +127,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
class HTMLParserTreeBuilder(HTMLTreeBuilder):
is_xml = False
features = [HTML, STRICT, HTMLPARSER]
picklable = True
NAME = HTMLPARSER
features = [NAME, HTML, STRICT]
def __init__(self, *args, **kwargs):
if CONSTRUCTOR_TAKES_STRICT:
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
kwargs['convert_charrefs'] = False
self.parser_args = (args, kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
document_declared_encoding=None, exclude_encodings=None):
"""
:return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be
@@ -147,7 +150,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
return
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
dammit = UnicodeDammit(markup, try_encodings, is_html=True,
exclude_encodings=exclude_encodings)
yield (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)

View File

@@ -7,7 +7,12 @@ from io import BytesIO
from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
from bs4.element import (
Comment,
Doctype,
NamespacedAttribute,
ProcessingInstruction,
)
from bs4.builder import (
FAST,
HTML,
@@ -25,8 +30,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
is_xml = True
NAME = "lxml-xml"
ALTERNATE_NAMES = ["xml"]
# Well, it's permissive by XML parser standards.
features = [LXML, XML, FAST, PERMISSIVE]
features = [NAME, LXML, XML, FAST, PERMISSIVE]
CHUNK_SIZE = 512
@@ -70,6 +78,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
return (None, tag)
def prepare_markup(self, markup, user_specified_encoding=None,
exclude_encodings=None,
document_declared_encoding=None):
"""
:yield: A series of 4-tuples.
@@ -95,7 +104,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# the document as each one in turn.
is_html = not self.is_xml
try_encodings = [user_specified_encoding, document_declared_encoding]
detector = EncodingDetector(markup, try_encodings, is_html)
detector = EncodingDetector(
markup, try_encodings, is_html, exclude_encodings)
for encoding in detector.encodings:
yield (detector.markup, encoding, document_declared_encoding, False)
@@ -189,7 +199,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.nsmaps.pop()
def pi(self, target, data):
    """Record a processing instruction in the soup.

    Any pending text is flushed first, then the target and data, joined
    by a single space, are handed to the soup and closed off as a
    ProcessingInstruction node.
    """
    soup = self.soup
    soup.endData()
    pi_text = target + ' ' + data
    soup.handle_data(pi_text)
    soup.endData(ProcessingInstruction)
def data(self, content):
self.soup.handle_data(content)
@@ -212,7 +224,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
features = [LXML, HTML, FAST, PERMISSIVE]
NAME = LXML
ALTERNATE_NAMES = ["lxml-html"]
features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
is_xml = False
def default_parser(self, encoding):

View File

@@ -3,10 +3,11 @@
This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and XML, but it does not rewrite the
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
from pdb import set_trace
import codecs
from htmlentitydefs import codepoint2name
import re
@@ -212,8 +213,11 @@ class EncodingDetector:
5. Windows-1252.
"""
def __init__(self, markup, override_encodings=None, is_html=False):
def __init__(self, markup, override_encodings=None, is_html=False,
exclude_encodings=None):
self.override_encodings = override_encodings or []
exclude_encodings = exclude_encodings or []
self.exclude_encodings = set([x.lower() for x in exclude_encodings])
self.chardet_encoding = None
self.is_html = is_html
self.declared_encoding = None
@@ -224,6 +228,8 @@ class EncodingDetector:
def _usable(self, encoding, tried):
if encoding is not None:
encoding = encoding.lower()
if encoding in self.exclude_encodings:
return False
if encoding not in tried:
tried.add(encoding)
return True
@@ -266,6 +272,9 @@ class EncodingDetector:
def strip_byte_order_mark(cls, data):
"""If a byte-order mark is present, strip it and return the encoding it implies."""
encoding = None
if isinstance(data, unicode):
# Unicode data cannot have a byte-order mark.
return data, encoding
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be'
@@ -299,14 +308,14 @@ class EncodingDetector:
else:
xml_endpos = 1024
html_endpos = max(2048, int(len(markup) * 0.05))
declared_encoding = None
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
if not declared_encoding_match and is_html:
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
if declared_encoding_match is not None:
declared_encoding = declared_encoding_match.groups()[0].decode(
'ascii')
'ascii', 'replace')
if declared_encoding:
return declared_encoding.lower()
return None
@@ -331,13 +340,14 @@ class UnicodeDammit:
]
def __init__(self, markup, override_encodings=[],
smart_quotes_to=None, is_html=False):
smart_quotes_to=None, is_html=False, exclude_encodings=[]):
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
self.contains_replacement_characters = False
self.is_html = is_html
self.detector = EncodingDetector(markup, override_encodings, is_html)
self.detector = EncodingDetector(
markup, override_encodings, is_html, exclude_encodings)
# Short-circuit if the data is in Unicode to begin with.
if isinstance(markup, unicode) or markup == '':

View File

@@ -33,12 +33,21 @@ def diagnose(data):
if 'lxml' in basic_parsers:
basic_parsers.append(["lxml", "xml"])
from lxml import etree
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
try:
from lxml import etree
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
except ImportError, e:
print (
"lxml is not installed or couldn't be imported.")
if 'html5lib' in basic_parsers:
import html5lib
print "Found html5lib version %s" % html5lib.__version__
try:
import html5lib
print "Found html5lib version %s" % html5lib.__version__
except ImportError, e:
print (
"html5lib is not installed or couldn't be imported.")
if hasattr(data, 'read'):
data = data.read()
@@ -135,7 +144,7 @@ def rword(length=5):
def rsentence(length=4):
"Generate a random sentence-like string."
return " ".join(rword(random.randint(4,9)) for i in range(length))
def rdoc(num_elements=1000):
"""Randomly generate an invalid HTML document."""
tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
@@ -159,7 +168,7 @@ def benchmark_parsers(num_elements=100000):
print "Comparative parser benchmark on Beautiful Soup %s" % __version__
data = rdoc(num_elements)
print "Generated a large invalid HTML document (%d bytes)." % len(data)
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False
try:

View File

@@ -1,3 +1,4 @@
from pdb import set_trace
import collections
import re
import sys
@@ -185,24 +186,40 @@ class PageElement(object):
return self.HTML_FORMATTERS.get(
name, HTMLAwareEntitySubstitution.substitute_xml)
def setup(self, parent=None, previous_element=None):
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
"""Sets up the initial relations between this element and
other elements."""
self.parent = parent
self.previous_element = previous_element
if previous_element is not None:
self.previous_element.next_element = self
self.next_element = None
self.previous_sibling = None
self.next_sibling = None
if self.parent is not None and self.parent.contents:
self.previous_sibling = self.parent.contents[-1]
self.next_element = next_element
if self.next_element:
self.next_element.previous_element = self
self.next_sibling = next_sibling
if self.next_sibling:
self.next_sibling.previous_sibling = self
if (not previous_sibling
and self.parent is not None and self.parent.contents):
previous_sibling = self.parent.contents[-1]
self.previous_sibling = previous_sibling
if previous_sibling:
self.previous_sibling.next_sibling = self
nextSibling = _alias("next_sibling") # BS3
previousSibling = _alias("previous_sibling") # BS3
def replace_with(self, replace_with):
if not self.parent:
raise ValueError(
"Cannot replace one element with another when the"
"element to be replaced is not part of a tree.")
if replace_with is self:
return
if replace_with is self.parent:
@@ -216,6 +233,10 @@ class PageElement(object):
def unwrap(self):
my_parent = self.parent
if not self.parent:
raise ValueError(
"Cannot replace an element with its contents when that"
"element is not part of a tree.")
my_index = self.parent.index(self)
self.extract()
for child in reversed(self.contents[:]):
@@ -240,17 +261,20 @@ class PageElement(object):
last_child = self._last_descendant()
next_element = last_child.next_element
if self.previous_element is not None:
if (self.previous_element is not None and
self.previous_element != next_element):
self.previous_element.next_element = next_element
if next_element is not None:
if next_element is not None and next_element != self.previous_element:
next_element.previous_element = self.previous_element
self.previous_element = None
last_child.next_element = None
self.parent = None
if self.previous_sibling is not None:
if (self.previous_sibling is not None
and self.previous_sibling != self.next_sibling):
self.previous_sibling.next_sibling = self.next_sibling
if self.next_sibling is not None:
if (self.next_sibling is not None
and self.next_sibling != self.previous_sibling):
self.next_sibling.previous_sibling = self.previous_sibling
self.previous_sibling = self.next_sibling = None
return self
@@ -478,6 +502,10 @@ class PageElement(object):
def _find_all(self, name, attrs, text, limit, generator, **kwargs):
"Iterates over a generator looking for things that match."
if text is None and 'string' in kwargs:
text = kwargs['string']
del kwargs['string']
if isinstance(name, SoupStrainer):
strainer = name
else:
@@ -548,17 +576,17 @@ class PageElement(object):
# Methods for supporting CSS selectors.
tag_name_re = re.compile('^[a-z0-9]+$')
tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')
# /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
# \---/ \---/\-------------/ \-------/
# | | | |
# | | | The value
# | | ~,|,^,$,* or =
# | Attribute
# /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
# \---------------------------/ \---/\-------------/ \-------/
# | | | |
# | | | The value
# | | ~,|,^,$,* or =
# | Attribute
# Tag
attribselect_re = re.compile(
r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
r'=?"?(?P<value>[^\]"]*)"?\]$'
)
@@ -654,11 +682,17 @@ class NavigableString(unicode, PageElement):
how to handle non-ASCII characters.
"""
if isinstance(value, unicode):
return unicode.__new__(cls, value)
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
u = unicode.__new__(cls, value)
else:
u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
u.setup()
return u
def __copy__(self):
return self
"""A copy of a NavigableString has the same contents and class
as the original, but it is not connected to the parse tree.
"""
return type(self)(self)
def __getnewargs__(self):
return (unicode(self),)
@@ -707,7 +741,7 @@ class CData(PreformattedString):
class ProcessingInstruction(PreformattedString):
PREFIX = u'<?'
SUFFIX = u'?>'
SUFFIX = u'>'
class Comment(PreformattedString):
@@ -759,9 +793,12 @@ class Tag(PageElement):
self.prefix = prefix
if attrs is None:
attrs = {}
elif attrs and builder.cdata_list_attributes:
attrs = builder._replace_cdata_list_attribute_values(
self.name, attrs)
elif attrs:
if builder is not None and builder.cdata_list_attributes:
attrs = builder._replace_cdata_list_attribute_values(
self.name, attrs)
else:
attrs = dict(attrs)
else:
attrs = dict(attrs)
self.attrs = attrs
@@ -778,6 +815,18 @@ class Tag(PageElement):
parserClass = _alias("parser_class") # BS3
def __copy__(self):
    """A copy of a Tag is a new Tag, unconnected to the parse tree.
    Its contents are a copy of the old Tag's contents.
    """
    # The constructor stores the namespace prefix as ``self.prefix``,
    # so that is the attribute to hand to the clone. Reading a
    # differently named attribute here would not return the prefix.
    clone = type(self)(None, self.builder, self.name, self.namespace,
                       self.prefix, self.attrs)
    # Carry over state that is set on the instance rather than passed
    # to the constructor.
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone, attr, getattr(self, attr))
    # Copy each child and append the copy to the clone.
    for child in self.contents:
        clone.append(child.__copy__())
    return clone
@property
def is_empty_element(self):
"""Is this tag an empty-element tag? (aka a self-closing tag)
@@ -971,15 +1020,25 @@ class Tag(PageElement):
as defined in __eq__."""
return not self == other
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
def __repr__(self, encoding="unicode-escape"):
"""Renders this tag as a string."""
return self.encode(encoding)
if PY3K:
# "The return value must be a string object", i.e. Unicode
return self.decode()
else:
# "The return value must be a string object", i.e. a bytestring.
# By convention, the return value of __repr__ should also be
# an ASCII string.
return self.encode(encoding)
def __unicode__(self):
return self.decode()
def __str__(self):
return self.encode()
if PY3K:
return self.decode()
else:
return self.encode()
if PY3K:
__str__ = __repr__ = __unicode__
@@ -1103,12 +1162,18 @@ class Tag(PageElement):
formatter="minimal"):
"""Renders the contents of this tag as a Unicode string.
:param indent_level: Each line of the rendering will be
indented this many spaces.
:param eventual_encoding: The tag is destined to be
encoded into this encoding. This method is _not_
responsible for performing that encoding. This information
is passed in so that it can be substituted in if the
document contains a <META> tag that mentions the document's
encoding.
:param formatter: The output formatter responsible for converting
entities to Unicode characters.
"""
# First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again.
@@ -1137,7 +1202,17 @@ class Tag(PageElement):
def encode_contents(
    self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
    formatter="minimal"):
    """Renders the contents of this tag as a bytestring.

    :param indent_level: Each line of the rendering will be
    indented this many spaces.

    :param encoding: The bytestring will be in this encoding.

    :param formatter: The output formatter responsible for converting
    entities to Unicode characters.
    """
    # Render to Unicode first (the target encoding is passed along to
    # decode_contents as well), then encode the result.
    contents = self.decode_contents(indent_level, encoding, formatter)
    return contents.encode(encoding)
@@ -1201,63 +1276,89 @@ class Tag(PageElement):
_selector_combinators = ['>', '+', '~']
_select_debug = False
def select(self, selector, _candidate_generator=None):
def select_one(self, selector):
"""Perform a CSS selection operation on the current element."""
tokens = selector.split()
value = self.select(selector, limit=1)
if value:
return value[0]
return None
def select(self, selector, _candidate_generator=None, limit=None):
"""Perform a CSS selection operation on the current element."""
# Remove whitespace directly after the grouping operator ','
# then split into tokens.
tokens = re.sub(',[\s]*',',', selector).split()
current_context = [self]
if tokens[-1] in self._selector_combinators:
raise ValueError(
'Final combinator "%s" is missing an argument.' % tokens[-1])
if self._select_debug:
print 'Running CSS selector "%s"' % selector
for index, token in enumerate(tokens):
if self._select_debug:
print ' Considering token "%s"' % token
recursive_candidate_generator = None
tag_name = None
for index, token_group in enumerate(tokens):
new_context = []
new_context_ids = set([])
# Grouping selectors, ie: p,a
grouped_tokens = token_group.split(',')
if '' in grouped_tokens:
raise ValueError('Invalid group selection syntax: %s' % token_group)
if tokens[index-1] in self._selector_combinators:
# This token was consumed by the previous combinator. Skip it.
if self._select_debug:
print ' Token was consumed by the previous combinator.'
continue
# Each operation corresponds to a checker function, a rule
# for determining whether a candidate matches the
# selector. Candidates are generated by the active
# iterator.
checker = None
m = self.attribselect_re.match(token)
if m is not None:
# Attribute selector
tag_name, attribute, operator, value = m.groups()
checker = self._attribute_checker(operator, attribute, value)
for token in grouped_tokens:
if self._select_debug:
print ' Considering token "%s"' % token
recursive_candidate_generator = None
tag_name = None
elif '#' in token:
# ID selector
tag_name, tag_id = token.split('#', 1)
def id_matches(tag):
return tag.get('id', None) == tag_id
checker = id_matches
# Each operation corresponds to a checker function, a rule
# for determining whether a candidate matches the
# selector. Candidates are generated by the active
# iterator.
checker = None
elif '.' in token:
# Class selector
tag_name, klass = token.split('.', 1)
classes = set(klass.split('.'))
def classes_match(candidate):
return classes.issubset(candidate.get('class', []))
checker = classes_match
m = self.attribselect_re.match(token)
if m is not None:
# Attribute selector
tag_name, attribute, operator, value = m.groups()
checker = self._attribute_checker(operator, attribute, value)
elif ':' in token:
# Pseudo-class
tag_name, pseudo = token.split(':', 1)
if tag_name == '':
raise ValueError(
"A pseudo-class must be prefixed with a tag name.")
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
found = []
if pseudo_attributes is not None:
pseudo_type, pseudo_value = pseudo_attributes.groups()
elif '#' in token:
# ID selector
tag_name, tag_id = token.split('#', 1)
def id_matches(tag):
return tag.get('id', None) == tag_id
checker = id_matches
elif '.' in token:
# Class selector
tag_name, klass = token.split('.', 1)
classes = set(klass.split('.'))
def classes_match(candidate):
return classes.issubset(candidate.get('class', []))
checker = classes_match
elif ':' in token:
# Pseudo-class
tag_name, pseudo = token.split(':', 1)
if tag_name == '':
raise ValueError(
"A pseudo-class must be prefixed with a tag name.")
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
found = []
if pseudo_attributes is None:
pseudo_type = pseudo
pseudo_value = None
else:
pseudo_type, pseudo_value = pseudo_attributes.groups()
if pseudo_type == 'nth-of-type':
try:
pseudo_value = int(pseudo_value)
@@ -1286,109 +1387,110 @@ class Tag(PageElement):
raise NotImplementedError(
'Only the following pseudo-classes are implemented: nth-of-type.')
elif token == '*':
# Star selector -- matches everything
pass
elif token == '>':
# Run the next token as a CSS selector against the
# direct children of each tag in the current context.
recursive_candidate_generator = lambda tag: tag.children
elif token == '~':
# Run the next token as a CSS selector against the
# siblings of each tag in the current context.
recursive_candidate_generator = lambda tag: tag.next_siblings
elif token == '+':
# For each tag in the current context, run the next
# token as a CSS selector against the tag's next
# sibling that's a tag.
def next_tag_sibling(tag):
yield tag.find_next_sibling(True)
recursive_candidate_generator = next_tag_sibling
elif token == '*':
# Star selector -- matches everything
pass
elif token == '>':
# Run the next token as a CSS selector against the
# direct children of each tag in the current context.
recursive_candidate_generator = lambda tag: tag.children
elif token == '~':
# Run the next token as a CSS selector against the
# siblings of each tag in the current context.
recursive_candidate_generator = lambda tag: tag.next_siblings
elif token == '+':
# For each tag in the current context, run the next
# token as a CSS selector against the tag's next
# sibling that's a tag.
def next_tag_sibling(tag):
yield tag.find_next_sibling(True)
recursive_candidate_generator = next_tag_sibling
elif self.tag_name_re.match(token):
# Just a tag name.
tag_name = token
else:
raise ValueError(
'Unsupported or invalid CSS selector: "%s"' % token)
if recursive_candidate_generator:
# This happens when the selector looks like "> foo".
#
# The generator calls select() recursively on every
# member of the current context, passing in a different
# candidate generator and a different selector.
#
# In the case of "> foo", the candidate generator is
# one that yields a tag's direct children (">"), and
# the selector is "foo".
next_token = tokens[index+1]
def recursive_select(tag):
if self._select_debug:
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
print '-' * 40
for i in tag.select(next_token, recursive_candidate_generator):
if self._select_debug:
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
yield i
if self._select_debug:
print '-' * 40
_use_candidate_generator = recursive_select
elif _candidate_generator is None:
# By default, a tag's candidates are all of its
# children. If tag_name is defined, only yield tags
# with that name.
if self._select_debug:
if tag_name:
check = "[any]"
else:
check = tag_name
print ' Default candidate generator, tag name="%s"' % check
if self._select_debug:
# This is redundant with later code, but it stops
# a bunch of bogus tags from cluttering up the
# debug log.
def default_candidate_generator(tag):
for child in tag.descendants:
if not isinstance(child, Tag):
continue
if tag_name and not child.name == tag_name:
continue
yield child
_use_candidate_generator = default_candidate_generator
elif self.tag_name_re.match(token):
# Just a tag name.
tag_name = token
else:
_use_candidate_generator = lambda tag: tag.descendants
else:
_use_candidate_generator = _candidate_generator
new_context = []
new_context_ids = set([])
for tag in current_context:
if self._select_debug:
print " Running candidate generator on %s %s" % (
tag.name, repr(tag.attrs))
for candidate in _use_candidate_generator(tag):
if not isinstance(candidate, Tag):
continue
if tag_name and candidate.name != tag_name:
continue
if checker is not None:
try:
result = checker(candidate)
except StopIteration:
# The checker has decided we should no longer
# run the generator.
break
if checker is None or result:
raise ValueError(
'Unsupported or invalid CSS selector: "%s"' % token)
if recursive_candidate_generator:
# This happens when the selector looks like "> foo".
#
# The generator calls select() recursively on every
# member of the current context, passing in a different
# candidate generator and a different selector.
#
# In the case of "> foo", the candidate generator is
# one that yields a tag's direct children (">"), and
# the selector is "foo".
next_token = tokens[index+1]
def recursive_select(tag):
if self._select_debug:
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
if id(candidate) not in new_context_ids:
# If a tag matches a selector more than once,
# don't include it in the context more than once.
new_context.append(candidate)
new_context_ids.add(id(candidate))
elif self._select_debug:
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
print '-' * 40
for i in tag.select(next_token, recursive_candidate_generator):
if self._select_debug:
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
yield i
if self._select_debug:
print '-' * 40
_use_candidate_generator = recursive_select
elif _candidate_generator is None:
# By default, a tag's candidates are all of its
# children. If tag_name is defined, only yield tags
# with that name.
if self._select_debug:
if tag_name:
check = "[any]"
else:
check = tag_name
print ' Default candidate generator, tag name="%s"' % check
if self._select_debug:
# This is redundant with later code, but it stops
# a bunch of bogus tags from cluttering up the
# debug log.
def default_candidate_generator(tag):
for child in tag.descendants:
if not isinstance(child, Tag):
continue
if tag_name and not child.name == tag_name:
continue
yield child
_use_candidate_generator = default_candidate_generator
else:
_use_candidate_generator = lambda tag: tag.descendants
else:
_use_candidate_generator = _candidate_generator
count = 0
for tag in current_context:
if self._select_debug:
print " Running candidate generator on %s %s" % (
tag.name, repr(tag.attrs))
for candidate in _use_candidate_generator(tag):
if not isinstance(candidate, Tag):
continue
if tag_name and candidate.name != tag_name:
continue
if checker is not None:
try:
result = checker(candidate)
except StopIteration:
# The checker has decided we should no longer
# run the generator.
break
if checker is None or result:
if self._select_debug:
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
if id(candidate) not in new_context_ids:
# If a tag matches a selector more than once,
# don't include it in the context more than once.
new_context.append(candidate)
new_context_ids.add(id(candidate))
if limit and len(new_context) >= limit:
break
elif self._select_debug:
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
current_context = new_context

View File

@@ -1,5 +1,6 @@
"""Helper classes for tests."""
import pickle
import copy
import functools
import unittest
@@ -43,6 +44,16 @@ class SoupTest(unittest.TestCase):
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
def assertConnectedness(self, element):
    """Ensure that next_element and previous_element are properly
    set for all descendants of the given element.

    Each consecutive pair (earlier, e) from element.descendants must
    satisfy earlier.next_element == e and e.previous_element == earlier.
    """
    earlier = None
    for e in element.descendants:
        # Compare against None explicitly: a falsy descendant (such as
        # an empty string) must not cause the check for the following
        # pair to be skipped.
        if earlier is not None:
            self.assertEqual(e, earlier.next_element)
            self.assertEqual(earlier, e.previous_element)
        earlier = e
class HTMLTreeBuilderSmokeTest(object):
@@ -54,6 +65,15 @@ class HTMLTreeBuilderSmokeTest(object):
markup in these tests, there's not much room for interpretation.
"""
def test_pickle_and_unpickle_identity(self):
    # A tree that is pickled and then unpickled should come back as
    # a BeautifulSoup object that serializes exactly like the original.
    original = self.soup("<a><b>foo</a>")
    # Round-trip through pickle protocol 2.
    restored = pickle.loads(pickle.dumps(original, 2))
    self.assertEqual(restored.__class__, BeautifulSoup)
    self.assertEqual(restored.decode(), original.decode())
def assertDoctypeHandled(self, doctype_fragment):
"""Assert that a given doctype string is handled correctly."""
doctype_str, soup = self._document_with_doctype(doctype_fragment)
@@ -114,6 +134,11 @@ class HTMLTreeBuilderSmokeTest(object):
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
def test_processing_instruction(self):
markup = b"""<?PITarget PIContent?>"""
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
def test_deepcopy(self):
"""Make sure you can copy the tree builder.
@@ -155,6 +180,23 @@ class HTMLTreeBuilderSmokeTest(object):
def test_nested_formatting_elements(self):
self.assertSoupEquals("<em><em></em></em>")
def test_double_head(self):
html = '''<!DOCTYPE html>
<html>
<head>
<title>Ordinary HEAD element test</title>
</head>
<script type="text/javascript">
alert("Help!");
</script>
<body>
Hello, world!
</body>
</html>
'''
soup = self.soup(html)
self.assertEqual("text/javascript", soup.find('script')['type'])
def test_comment(self):
# Comments are represented as Comment objects.
markup = "<p>foo<!--foobar-->baz</p>"
@@ -221,6 +263,14 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(["css"], soup.div.div['class'])
def test_multivalued_attribute_on_html(self):
# html5lib uses a different API to set the attributes ot the
# <html> tag. This has caused problems with multivalued
# attributes.
markup = '<html class="a b"></html>'
soup = self.soup(markup)
self.assertEqual(["a", "b"], soup.html['class'])
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
@@ -253,6 +303,35 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
self.assertEqual("p", soup.h2.string.next_element.name)
self.assertEqual("p", soup.p.name)
self.assertConnectedness(soup)
def test_head_tag_between_head_and_body(self):
"Prevent recurrence of a bug in the html5lib treebuilder."
content = """<html><head></head>
<link></link>
<body>foo</body>
</html>
"""
soup = self.soup(content)
self.assertNotEqual(None, soup.html.body)
self.assertConnectedness(soup)
def test_multiple_copies_of_a_tag(self):
"Prevent recurrence of a bug in the html5lib treebuilder."
content = """<!DOCTYPE html>
<html>
<body>
<article id="a" >
<div><a href="1"></div>
<footer>
<a href="2"></a>
</footer>
</article>
</body>
</html>
"""
soup = self.soup(content)
self.assertConnectedness(soup.article)
def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the
@@ -463,6 +542,15 @@ class HTMLTreeBuilderSmokeTest(object):
class XMLTreeBuilderSmokeTest(object):
def test_pickle_and_unpickle_identity(self):
    # Serialize a parsed tree with pickle protocol 2, load it back,
    # and confirm the copy is a BeautifulSoup object whose decoded
    # output is the same as the source tree's.
    source_tree = self.soup("<a><b>foo</a>")
    payload = pickle.dumps(source_tree, 2)
    copy_of_tree = pickle.loads(payload)
    self.assertEqual(copy_of_tree.__class__, BeautifulSoup)
    self.assertEqual(copy_of_tree.decode(), source_tree.decode())
def test_docstring_generated(self):
soup = self.soup("<root/>")
self.assertEqual(
@@ -485,7 +573,7 @@ class XMLTreeBuilderSmokeTest(object):
<script type="text/javascript">
</script>
"""
soup = BeautifulSoup(doc, "xml")
soup = BeautifulSoup(doc, "lxml-xml")
# lxml would have stripped this while parsing, but we can add
# it later.
soup.script.string = 'console.log("< < hey > > ");'

View File

@@ -20,4 +20,6 @@ from .serializer import serialize
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
"getTreeWalker", "serialize"]
__version__ = "0.999"
# this has to be at the top level, see how setup.py parses this
__version__ = "0.999999"

View File

@@ -1,292 +1,290 @@
from __future__ import absolute_import, division, unicode_literals
import string
import gettext
_ = gettext.gettext
EOF = None
E = {
"null-character":
_("Null character in input stream, replaced with U+FFFD."),
"Null character in input stream, replaced with U+FFFD.",
"invalid-codepoint":
_("Invalid codepoint in stream."),
"Invalid codepoint in stream.",
"incorrectly-placed-solidus":
_("Solidus (/) incorrectly placed in tag."),
"Solidus (/) incorrectly placed in tag.",
"incorrect-cr-newline-entity":
_("Incorrect CR newline entity, replaced with LF."),
"Incorrect CR newline entity, replaced with LF.",
"illegal-windows-1252-entity":
_("Entity used with illegal number (windows-1252 reference)."),
"Entity used with illegal number (windows-1252 reference).",
"cant-convert-numeric-entity":
_("Numeric entity couldn't be converted to character "
"(codepoint U+%(charAsInt)08x)."),
"Numeric entity couldn't be converted to character "
"(codepoint U+%(charAsInt)08x).",
"illegal-codepoint-for-numeric-entity":
_("Numeric entity represents an illegal codepoint: "
"U+%(charAsInt)08x."),
"Numeric entity represents an illegal codepoint: "
"U+%(charAsInt)08x.",
"numeric-entity-without-semicolon":
_("Numeric entity didn't end with ';'."),
"Numeric entity didn't end with ';'.",
"expected-numeric-entity-but-got-eof":
_("Numeric entity expected. Got end of file instead."),
"Numeric entity expected. Got end of file instead.",
"expected-numeric-entity":
_("Numeric entity expected but none found."),
"Numeric entity expected but none found.",
"named-entity-without-semicolon":
_("Named entity didn't end with ';'."),
"Named entity didn't end with ';'.",
"expected-named-entity":
_("Named entity expected. Got none."),
"Named entity expected. Got none.",
"attributes-in-end-tag":
_("End tag contains unexpected attributes."),
"End tag contains unexpected attributes.",
'self-closing-flag-on-end-tag':
_("End tag contains unexpected self-closing flag."),
"End tag contains unexpected self-closing flag.",
"expected-tag-name-but-got-right-bracket":
_("Expected tag name. Got '>' instead."),
"Expected tag name. Got '>' instead.",
"expected-tag-name-but-got-question-mark":
_("Expected tag name. Got '?' instead. (HTML doesn't "
"support processing instructions.)"),
"Expected tag name. Got '?' instead. (HTML doesn't "
"support processing instructions.)",
"expected-tag-name":
_("Expected tag name. Got something else instead"),
"Expected tag name. Got something else instead",
"expected-closing-tag-but-got-right-bracket":
_("Expected closing tag. Got '>' instead. Ignoring '</>'."),
"Expected closing tag. Got '>' instead. Ignoring '</>'.",
"expected-closing-tag-but-got-eof":
_("Expected closing tag. Unexpected end of file."),
"Expected closing tag. Unexpected end of file.",
"expected-closing-tag-but-got-char":
_("Expected closing tag. Unexpected character '%(data)s' found."),
"Expected closing tag. Unexpected character '%(data)s' found.",
"eof-in-tag-name":
_("Unexpected end of file in the tag name."),
"Unexpected end of file in the tag name.",
"expected-attribute-name-but-got-eof":
_("Unexpected end of file. Expected attribute name instead."),
"Unexpected end of file. Expected attribute name instead.",
"eof-in-attribute-name":
_("Unexpected end of file in attribute name."),
"Unexpected end of file in attribute name.",
"invalid-character-in-attribute-name":
_("Invalid character in attribute name"),
"Invalid character in attribute name",
"duplicate-attribute":
_("Dropped duplicate attribute on tag."),
"Dropped duplicate attribute on tag.",
"expected-end-of-tag-name-but-got-eof":
_("Unexpected end of file. Expected = or end of tag."),
"Unexpected end of file. Expected = or end of tag.",
"expected-attribute-value-but-got-eof":
_("Unexpected end of file. Expected attribute value."),
"Unexpected end of file. Expected attribute value.",
"expected-attribute-value-but-got-right-bracket":
_("Expected attribute value. Got '>' instead."),
"Expected attribute value. Got '>' instead.",
'equals-in-unquoted-attribute-value':
_("Unexpected = in unquoted attribute"),
"Unexpected = in unquoted attribute",
'unexpected-character-in-unquoted-attribute-value':
_("Unexpected character in unquoted attribute"),
"Unexpected character in unquoted attribute",
"invalid-character-after-attribute-name":
_("Unexpected character after attribute name."),
"Unexpected character after attribute name.",
"unexpected-character-after-attribute-value":
_("Unexpected character after attribute value."),
"Unexpected character after attribute value.",
"eof-in-attribute-value-double-quote":
_("Unexpected end of file in attribute value (\")."),
"Unexpected end of file in attribute value (\").",
"eof-in-attribute-value-single-quote":
_("Unexpected end of file in attribute value (')."),
"Unexpected end of file in attribute value (').",
"eof-in-attribute-value-no-quotes":
_("Unexpected end of file in attribute value."),
"Unexpected end of file in attribute value.",
"unexpected-EOF-after-solidus-in-tag":
_("Unexpected end of file in tag. Expected >"),
"Unexpected end of file in tag. Expected >",
"unexpected-character-after-solidus-in-tag":
_("Unexpected character after / in tag. Expected >"),
"Unexpected character after / in tag. Expected >",
"expected-dashes-or-doctype":
_("Expected '--' or 'DOCTYPE'. Not found."),
"Expected '--' or 'DOCTYPE'. Not found.",
"unexpected-bang-after-double-dash-in-comment":
_("Unexpected ! after -- in comment"),
"Unexpected ! after -- in comment",
"unexpected-space-after-double-dash-in-comment":
_("Unexpected space after -- in comment"),
"Unexpected space after -- in comment",
"incorrect-comment":
_("Incorrect comment."),
"Incorrect comment.",
"eof-in-comment":
_("Unexpected end of file in comment."),
"Unexpected end of file in comment.",
"eof-in-comment-end-dash":
_("Unexpected end of file in comment (-)"),
"Unexpected end of file in comment (-)",
"unexpected-dash-after-double-dash-in-comment":
_("Unexpected '-' after '--' found in comment."),
"Unexpected '-' after '--' found in comment.",
"eof-in-comment-double-dash":
_("Unexpected end of file in comment (--)."),
"Unexpected end of file in comment (--).",
"eof-in-comment-end-space-state":
_("Unexpected end of file in comment."),
"Unexpected end of file in comment.",
"eof-in-comment-end-bang-state":
_("Unexpected end of file in comment."),
"Unexpected end of file in comment.",
"unexpected-char-in-comment":
_("Unexpected character in comment found."),
"Unexpected character in comment found.",
"need-space-after-doctype":
_("No space after literal string 'DOCTYPE'."),
"No space after literal string 'DOCTYPE'.",
"expected-doctype-name-but-got-right-bracket":
_("Unexpected > character. Expected DOCTYPE name."),
"Unexpected > character. Expected DOCTYPE name.",
"expected-doctype-name-but-got-eof":
_("Unexpected end of file. Expected DOCTYPE name."),
"Unexpected end of file. Expected DOCTYPE name.",
"eof-in-doctype-name":
_("Unexpected end of file in DOCTYPE name."),
"Unexpected end of file in DOCTYPE name.",
"eof-in-doctype":
_("Unexpected end of file in DOCTYPE."),
"Unexpected end of file in DOCTYPE.",
"expected-space-or-right-bracket-in-doctype":
_("Expected space or '>'. Got '%(data)s'"),
"Expected space or '>'. Got '%(data)s'",
"unexpected-end-of-doctype":
_("Unexpected end of DOCTYPE."),
"Unexpected end of DOCTYPE.",
"unexpected-char-in-doctype":
_("Unexpected character in DOCTYPE."),
"Unexpected character in DOCTYPE.",
"eof-in-innerhtml":
_("XXX innerHTML EOF"),
"XXX innerHTML EOF",
"unexpected-doctype":
_("Unexpected DOCTYPE. Ignored."),
"Unexpected DOCTYPE. Ignored.",
"non-html-root":
_("html needs to be the first start tag."),
"html needs to be the first start tag.",
"expected-doctype-but-got-eof":
_("Unexpected End of file. Expected DOCTYPE."),
"Unexpected End of file. Expected DOCTYPE.",
"unknown-doctype":
_("Erroneous DOCTYPE."),
"Erroneous DOCTYPE.",
"expected-doctype-but-got-chars":
_("Unexpected non-space characters. Expected DOCTYPE."),
"Unexpected non-space characters. Expected DOCTYPE.",
"expected-doctype-but-got-start-tag":
_("Unexpected start tag (%(name)s). Expected DOCTYPE."),
"Unexpected start tag (%(name)s). Expected DOCTYPE.",
"expected-doctype-but-got-end-tag":
_("Unexpected end tag (%(name)s). Expected DOCTYPE."),
"Unexpected end tag (%(name)s). Expected DOCTYPE.",
"end-tag-after-implied-root":
_("Unexpected end tag (%(name)s) after the (implied) root element."),
"Unexpected end tag (%(name)s) after the (implied) root element.",
"expected-named-closing-tag-but-got-eof":
_("Unexpected end of file. Expected end tag (%(name)s)."),
"Unexpected end of file. Expected end tag (%(name)s).",
"two-heads-are-not-better-than-one":
_("Unexpected start tag head in existing head. Ignored."),
"Unexpected start tag head in existing head. Ignored.",
"unexpected-end-tag":
_("Unexpected end tag (%(name)s). Ignored."),
"Unexpected end tag (%(name)s). Ignored.",
"unexpected-start-tag-out-of-my-head":
_("Unexpected start tag (%(name)s) that can be in head. Moved."),
"Unexpected start tag (%(name)s) that can be in head. Moved.",
"unexpected-start-tag":
_("Unexpected start tag (%(name)s)."),
"Unexpected start tag (%(name)s).",
"missing-end-tag":
_("Missing end tag (%(name)s)."),
"Missing end tag (%(name)s).",
"missing-end-tags":
_("Missing end tags (%(name)s)."),
"Missing end tags (%(name)s).",
"unexpected-start-tag-implies-end-tag":
_("Unexpected start tag (%(startName)s) "
"implies end tag (%(endName)s)."),
"Unexpected start tag (%(startName)s) "
"implies end tag (%(endName)s).",
"unexpected-start-tag-treated-as":
_("Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
"Unexpected start tag (%(originalName)s). Treated as %(newName)s.",
"deprecated-tag":
_("Unexpected start tag %(name)s. Don't use it!"),
"Unexpected start tag %(name)s. Don't use it!",
"unexpected-start-tag-ignored":
_("Unexpected start tag %(name)s. Ignored."),
"Unexpected start tag %(name)s. Ignored.",
"expected-one-end-tag-but-got-another":
_("Unexpected end tag (%(gotName)s). "
"Missing end tag (%(expectedName)s)."),
"Unexpected end tag (%(gotName)s). "
"Missing end tag (%(expectedName)s).",
"end-tag-too-early":
_("End tag (%(name)s) seen too early. Expected other end tag."),
"End tag (%(name)s) seen too early. Expected other end tag.",
"end-tag-too-early-named":
_("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
"Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s).",
"end-tag-too-early-ignored":
_("End tag (%(name)s) seen too early. Ignored."),
"End tag (%(name)s) seen too early. Ignored.",
"adoption-agency-1.1":
_("End tag (%(name)s) violates step 1, "
"paragraph 1 of the adoption agency algorithm."),
"End tag (%(name)s) violates step 1, "
"paragraph 1 of the adoption agency algorithm.",
"adoption-agency-1.2":
_("End tag (%(name)s) violates step 1, "
"paragraph 2 of the adoption agency algorithm."),
"End tag (%(name)s) violates step 1, "
"paragraph 2 of the adoption agency algorithm.",
"adoption-agency-1.3":
_("End tag (%(name)s) violates step 1, "
"paragraph 3 of the adoption agency algorithm."),
"End tag (%(name)s) violates step 1, "
"paragraph 3 of the adoption agency algorithm.",
"adoption-agency-4.4":
_("End tag (%(name)s) violates step 4, "
"paragraph 4 of the adoption agency algorithm."),
"End tag (%(name)s) violates step 4, "
"paragraph 4 of the adoption agency algorithm.",
"unexpected-end-tag-treated-as":
_("Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
"Unexpected end tag (%(originalName)s). Treated as %(newName)s.",
"no-end-tag":
_("This element (%(name)s) has no end tag."),
"This element (%(name)s) has no end tag.",
"unexpected-implied-end-tag-in-table":
_("Unexpected implied end tag (%(name)s) in the table phase."),
"Unexpected implied end tag (%(name)s) in the table phase.",
"unexpected-implied-end-tag-in-table-body":
_("Unexpected implied end tag (%(name)s) in the table body phase."),
"Unexpected implied end tag (%(name)s) in the table body phase.",
"unexpected-char-implies-table-voodoo":
_("Unexpected non-space characters in "
"table context caused voodoo mode."),
"Unexpected non-space characters in "
"table context caused voodoo mode.",
"unexpected-hidden-input-in-table":
_("Unexpected input with type hidden in table context."),
"Unexpected input with type hidden in table context.",
"unexpected-form-in-table":
_("Unexpected form in table context."),
"Unexpected form in table context.",
"unexpected-start-tag-implies-table-voodoo":
_("Unexpected start tag (%(name)s) in "
"table context caused voodoo mode."),
"Unexpected start tag (%(name)s) in "
"table context caused voodoo mode.",
"unexpected-end-tag-implies-table-voodoo":
_("Unexpected end tag (%(name)s) in "
"table context caused voodoo mode."),
"Unexpected end tag (%(name)s) in "
"table context caused voodoo mode.",
"unexpected-cell-in-table-body":
_("Unexpected table cell start tag (%(name)s) "
"in the table body phase."),
"Unexpected table cell start tag (%(name)s) "
"in the table body phase.",
"unexpected-cell-end-tag":
_("Got table cell end tag (%(name)s) "
"while required end tags are missing."),
"Got table cell end tag (%(name)s) "
"while required end tags are missing.",
"unexpected-end-tag-in-table-body":
_("Unexpected end tag (%(name)s) in the table body phase. Ignored."),
"Unexpected end tag (%(name)s) in the table body phase. Ignored.",
"unexpected-implied-end-tag-in-table-row":
_("Unexpected implied end tag (%(name)s) in the table row phase."),
"Unexpected implied end tag (%(name)s) in the table row phase.",
"unexpected-end-tag-in-table-row":
_("Unexpected end tag (%(name)s) in the table row phase. Ignored."),
"Unexpected end tag (%(name)s) in the table row phase. Ignored.",
"unexpected-select-in-select":
_("Unexpected select start tag in the select phase "
"treated as select end tag."),
"Unexpected select start tag in the select phase "
"treated as select end tag.",
"unexpected-input-in-select":
_("Unexpected input start tag in the select phase."),
"Unexpected input start tag in the select phase.",
"unexpected-start-tag-in-select":
_("Unexpected start tag token (%(name)s in the select phase. "
"Ignored."),
"Unexpected start tag token (%(name)s in the select phase. "
"Ignored.",
"unexpected-end-tag-in-select":
_("Unexpected end tag (%(name)s) in the select phase. Ignored."),
"Unexpected end tag (%(name)s) in the select phase. Ignored.",
"unexpected-table-element-start-tag-in-select-in-table":
_("Unexpected table element start tag (%(name)s) in the select in table phase."),
"Unexpected table element start tag (%(name)s) in the select in table phase.",
"unexpected-table-element-end-tag-in-select-in-table":
_("Unexpected table element end tag (%(name)s) in the select in table phase."),
"Unexpected table element end tag (%(name)s) in the select in table phase.",
"unexpected-char-after-body":
_("Unexpected non-space characters in the after body phase."),
"Unexpected non-space characters in the after body phase.",
"unexpected-start-tag-after-body":
_("Unexpected start tag token (%(name)s)"
" in the after body phase."),
"Unexpected start tag token (%(name)s)"
" in the after body phase.",
"unexpected-end-tag-after-body":
_("Unexpected end tag token (%(name)s)"
" in the after body phase."),
"Unexpected end tag token (%(name)s)"
" in the after body phase.",
"unexpected-char-in-frameset":
_("Unexpected characters in the frameset phase. Characters ignored."),
"Unexpected characters in the frameset phase. Characters ignored.",
"unexpected-start-tag-in-frameset":
_("Unexpected start tag token (%(name)s)"
" in the frameset phase. Ignored."),
"Unexpected start tag token (%(name)s)"
" in the frameset phase. Ignored.",
"unexpected-frameset-in-frameset-innerhtml":
_("Unexpected end tag token (frameset) "
"in the frameset phase (innerHTML)."),
"Unexpected end tag token (frameset) "
"in the frameset phase (innerHTML).",
"unexpected-end-tag-in-frameset":
_("Unexpected end tag token (%(name)s)"
" in the frameset phase. Ignored."),
"Unexpected end tag token (%(name)s)"
" in the frameset phase. Ignored.",
"unexpected-char-after-frameset":
_("Unexpected non-space characters in the "
"after frameset phase. Ignored."),
"Unexpected non-space characters in the "
"after frameset phase. Ignored.",
"unexpected-start-tag-after-frameset":
_("Unexpected start tag (%(name)s)"
" in the after frameset phase. Ignored."),
"Unexpected start tag (%(name)s)"
" in the after frameset phase. Ignored.",
"unexpected-end-tag-after-frameset":
_("Unexpected end tag (%(name)s)"
" in the after frameset phase. Ignored."),
"Unexpected end tag (%(name)s)"
" in the after frameset phase. Ignored.",
"unexpected-end-tag-after-body-innerhtml":
_("Unexpected end tag after body(innerHtml)"),
"Unexpected end tag after body(innerHtml)",
"expected-eof-but-got-char":
_("Unexpected non-space characters. Expected end of file."),
"Unexpected non-space characters. Expected end of file.",
"expected-eof-but-got-start-tag":
_("Unexpected start tag (%(name)s)"
". Expected end of file."),
"Unexpected start tag (%(name)s)"
". Expected end of file.",
"expected-eof-but-got-end-tag":
_("Unexpected end tag (%(name)s)"
". Expected end of file."),
"Unexpected end tag (%(name)s)"
". Expected end of file.",
"eof-in-table":
_("Unexpected end of file. Expected table content."),
"Unexpected end of file. Expected table content.",
"eof-in-select":
_("Unexpected end of file. Expected select content."),
"Unexpected end of file. Expected select content.",
"eof-in-frameset":
_("Unexpected end of file. Expected frameset content."),
"Unexpected end of file. Expected frameset content.",
"eof-in-script-in-script":
_("Unexpected end of file. Expected script content."),
"Unexpected end of file. Expected script content.",
"eof-in-foreign-lands":
_("Unexpected end of file. Expected foreign content"),
"Unexpected end of file. Expected foreign content",
"non-void-element-with-trailing-solidus":
_("Trailing solidus not allowed on element %(name)s"),
"Trailing solidus not allowed on element %(name)s",
"unexpected-html-element-in-foreign-content":
_("Element %(name)s not allowed in a non-html context"),
"Element %(name)s not allowed in a non-html context",
"unexpected-end-tag-before-html":
_("Unexpected end tag (%(name)s) before html."),
"Unexpected end tag (%(name)s) before html.",
"XXX-undefined-error":
_("Undefined error (this sucks and should be fixed)"),
"Undefined error (this sucks and should be fixed)",
}
namespaces = {
@@ -298,7 +296,7 @@ namespaces = {
"xmlns": "http://www.w3.org/2000/xmlns/"
}
scopingElements = frozenset((
scopingElements = frozenset([
(namespaces["html"], "applet"),
(namespaces["html"], "caption"),
(namespaces["html"], "html"),
@@ -316,9 +314,9 @@ scopingElements = frozenset((
(namespaces["svg"], "foreignObject"),
(namespaces["svg"], "desc"),
(namespaces["svg"], "title"),
))
])
formattingElements = frozenset((
formattingElements = frozenset([
(namespaces["html"], "a"),
(namespaces["html"], "b"),
(namespaces["html"], "big"),
@@ -333,9 +331,9 @@ formattingElements = frozenset((
(namespaces["html"], "strong"),
(namespaces["html"], "tt"),
(namespaces["html"], "u")
))
])
specialElements = frozenset((
specialElements = frozenset([
(namespaces["html"], "address"),
(namespaces["html"], "applet"),
(namespaces["html"], "area"),
@@ -416,22 +414,22 @@ specialElements = frozenset((
(namespaces["html"], "wbr"),
(namespaces["html"], "xmp"),
(namespaces["svg"], "foreignObject")
))
])
htmlIntegrationPointElements = frozenset((
htmlIntegrationPointElements = frozenset([
(namespaces["mathml"], "annotaion-xml"),
(namespaces["svg"], "foreignObject"),
(namespaces["svg"], "desc"),
(namespaces["svg"], "title")
))
])
mathmlTextIntegrationPointElements = frozenset((
mathmlTextIntegrationPointElements = frozenset([
(namespaces["mathml"], "mi"),
(namespaces["mathml"], "mo"),
(namespaces["mathml"], "mn"),
(namespaces["mathml"], "ms"),
(namespaces["mathml"], "mtext")
))
])
adjustForeignAttributes = {
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
@@ -451,21 +449,21 @@ adjustForeignAttributes = {
unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
adjustForeignAttributes.items()])
spaceCharacters = frozenset((
spaceCharacters = frozenset([
"\t",
"\n",
"\u000C",
" ",
"\r"
))
])
tableInsertModeElements = frozenset((
tableInsertModeElements = frozenset([
"table",
"tbody",
"tfoot",
"thead",
"tr"
))
])
asciiLowercase = frozenset(string.ascii_lowercase)
asciiUppercase = frozenset(string.ascii_uppercase)
@@ -486,7 +484,7 @@ headingElements = (
"h6"
)
voidElements = frozenset((
voidElements = frozenset([
"base",
"command",
"event-source",
@@ -502,11 +500,11 @@ voidElements = frozenset((
"input",
"source",
"track"
))
])
cdataElements = frozenset(('title', 'textarea'))
cdataElements = frozenset(['title', 'textarea'])
rcdataElements = frozenset((
rcdataElements = frozenset([
'style',
'script',
'xmp',
@@ -514,27 +512,27 @@ rcdataElements = frozenset((
'noembed',
'noframes',
'noscript'
))
])
booleanAttributes = {
"": frozenset(("irrelevant",)),
"style": frozenset(("scoped",)),
"img": frozenset(("ismap",)),
"audio": frozenset(("autoplay", "controls")),
"video": frozenset(("autoplay", "controls")),
"script": frozenset(("defer", "async")),
"details": frozenset(("open",)),
"datagrid": frozenset(("multiple", "disabled")),
"command": frozenset(("hidden", "disabled", "checked", "default")),
"hr": frozenset(("noshade")),
"menu": frozenset(("autosubmit",)),
"fieldset": frozenset(("disabled", "readonly")),
"option": frozenset(("disabled", "readonly", "selected")),
"optgroup": frozenset(("disabled", "readonly")),
"button": frozenset(("disabled", "autofocus")),
"input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")),
"select": frozenset(("disabled", "readonly", "autofocus", "multiple")),
"output": frozenset(("disabled", "readonly")),
"": frozenset(["irrelevant"]),
"style": frozenset(["scoped"]),
"img": frozenset(["ismap"]),
"audio": frozenset(["autoplay", "controls"]),
"video": frozenset(["autoplay", "controls"]),
"script": frozenset(["defer", "async"]),
"details": frozenset(["open"]),
"datagrid": frozenset(["multiple", "disabled"]),
"command": frozenset(["hidden", "disabled", "checked", "default"]),
"hr": frozenset(["noshade"]),
"menu": frozenset(["autosubmit"]),
"fieldset": frozenset(["disabled", "readonly"]),
"option": frozenset(["disabled", "readonly", "selected"]),
"optgroup": frozenset(["disabled", "readonly"]),
"button": frozenset(["disabled", "autofocus"]),
"input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
"select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
"output": frozenset(["disabled", "readonly"]),
}
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
@@ -574,7 +572,7 @@ entitiesWindows1252 = (
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
)
xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;'))
xmlEntities = frozenset(['lt;', 'gt;', 'amp;', 'apos;', 'quot;'])
entities = {
"AElig": "\xc6",
@@ -3088,8 +3086,8 @@ tokenTypes = {
"ParseError": 7
}
tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]))
tagTokenTypes = frozenset([tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]])
prefixes = dict([(v, k) for k, v in namespaces.items()])

View File

@@ -1,8 +1,5 @@
from __future__ import absolute_import, division, unicode_literals
from gettext import gettext
_ = gettext
from . import _base
from ..constants import cdataElements, rcdataElements, voidElements
@@ -23,24 +20,24 @@ class Filter(_base.Filter):
if type in ("StartTag", "EmptyTag"):
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name})
raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name})
if not isinstance(name, str):
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
if not name:
raise LintError(_("Empty tag name"))
raise LintError("Empty tag name")
if type == "StartTag" and name in voidElements:
raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name})
raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name})
elif type == "EmptyTag" and name not in voidElements:
raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]})
raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
if type == "StartTag":
open_elements.append(name)
for name, value in token["data"]:
if not isinstance(name, str):
raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name})
raise LintError("Attribute name is not a string: %(name)r" % {"name": name})
if not name:
raise LintError(_("Empty attribute name"))
raise LintError("Empty attribute name")
if not isinstance(value, str):
raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value})
raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
if name in cdataElements:
contentModelFlag = "CDATA"
elif name in rcdataElements:
@@ -51,43 +48,43 @@ class Filter(_base.Filter):
elif type == "EndTag":
name = token["name"]
if not isinstance(name, str):
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
if not name:
raise LintError(_("Empty tag name"))
raise LintError("Empty tag name")
if name in voidElements:
raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name})
raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name})
start_name = open_elements.pop()
if start_name != name:
raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name})
raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
contentModelFlag = "PCDATA"
elif type == "Comment":
if contentModelFlag != "PCDATA":
raise LintError(_("Comment not in PCDATA content model flag"))
raise LintError("Comment not in PCDATA content model flag")
elif type in ("Characters", "SpaceCharacters"):
data = token["data"]
if not isinstance(data, str):
raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data})
raise LintError("Attribute name is not a string: %(name)r" % {"name": data})
if not data:
raise LintError(_("%(type)s token with empty data") % {"type": type})
raise LintError("%(type)s token with empty data" % {"type": type})
if type == "SpaceCharacters":
data = data.strip(spaceCharacters)
if data:
raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data})
raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})
elif type == "Doctype":
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name})
raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name})
if not isinstance(name, str):
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
# XXX: what to do with token["data"] ?
elif type in ("ParseError", "SerializeError"):
pass
else:
raise LintError(_("Unknown token type: %(type)s") % {"type": type})
raise LintError("Unknown token type: %(type)s" % {"type": type})
yield token

View File

@@ -18,6 +18,7 @@ from .constants import cdataElements, rcdataElements
from .constants import tokenTypes, ReparseException, namespaces
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
from .constants import adjustForeignAttributes as adjustForeignAttributesMap
from .constants import E
def parse(doc, treebuilder="etree", encoding=None,
@@ -129,6 +130,17 @@ class HTMLParser(object):
self.framesetOK = True
@property
def documentEncoding(self):
    """Name of the character encoding used to decode the input stream.

    Evaluates to :obj:`None` while the parser has no ``tokenizer``
    attribute yet; otherwise it is the first item of the tokenizer
    stream's ``charEncoding``.
    """
    if hasattr(self, 'tokenizer'):
        return self.tokenizer.stream.charEncoding[0]
    return None
def isHTMLIntegrationPoint(self, element):
if (element.name == "annotation-xml" and
element.namespace == namespaces["mathml"]):
@@ -245,7 +257,7 @@ class HTMLParser(object):
# XXX The idea is to make errorcode mandatory.
self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
if self.strict:
raise ParseError
raise ParseError(E[errorcode] % datavars)
def normalizeToken(self, token):
""" HTML5 specific normalizations to the token stream """
@@ -868,7 +880,7 @@ def getPhases(debug):
self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml),
(("base", "basefont", "bgsound", "command", "link", "meta",
"noframes", "script", "style", "title"),
"script", "style", "title"),
self.startTagProcessInHead),
("body", self.startTagBody),
("frameset", self.startTagFrameset),
@@ -1205,8 +1217,7 @@ def getPhases(debug):
attributes["name"] = "isindex"
self.processStartTag(impliedTagToken("input", "StartTag",
attributes=attributes,
selfClosing=
token["selfClosing"]))
selfClosing=token["selfClosing"]))
self.processEndTag(impliedTagToken("label"))
self.processStartTag(impliedTagToken("hr", "StartTag"))
self.processEndTag(impliedTagToken("form"))

View File

@@ -28,7 +28,18 @@ asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
# Character class of code points treated as invalid input: C0/C1 control
# characters and Unicode noncharacters. The surrogate range is deliberately
# left out so this literal stays legal on every platform.
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"

if utils.supports_lone_surrogates:
    # Use one extra step of indirection and create the surrogate range with
    # eval. Not using this indirection would introduce an illegal unicode
    # literal on platforms not supporting such lone surrogates.
    #
    # The range must end up *inside* the character class, so it is spliced
    # in before the closing "]". Appending it after the "]" would make the
    # pattern match an invalid character followed by the literal text
    # "\uD800-\uDFFF" instead of matching lone surrogates themselves.
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
                                    eval('"\\uD800-\\uDFFF"') +
                                    "]")
else:
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -164,13 +175,18 @@ class HTMLUnicodeInputStream(object):
"""
# Craziness
if len("\U0010FFFF") == 1:
if not utils.supports_lone_surrogates:
# Such platforms will have already checked for such
# surrogate errors, so no need to do this checking.
self.reportCharacterErrors = None
self.replaceCharactersRegexp = None
elif len("\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
else:
self.reportCharacterErrors = self.characterErrorsUCS2
self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
self.replaceCharactersRegexp = re.compile(
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
# List of where new lines occur
self.newLines = [0]
@@ -265,11 +281,12 @@ class HTMLUnicodeInputStream(object):
self._bufferedCharacter = data[-1]
data = data[:-1]
self.reportCharacterErrors(data)
if self.reportCharacterErrors:
self.reportCharacterErrors(data)
# Replace invalid characters
# Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub("\ufffd", data)
# Replace invalid characters
# Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub("\ufffd", data)
data = data.replace("\r\n", "\n")
data = data.replace("\r", "\n")

View File

@@ -2,11 +2,26 @@ from __future__ import absolute_import, division, unicode_literals
import re
from xml.sax.saxutils import escape, unescape
from six.moves import urllib_parse as urlparse
from .tokenizer import HTMLTokenizer
from .constants import tokenTypes
# Pattern for the path component of a ``data:`` URI, i.e. everything after
# the scheme: a media type, optional ``;charset=...`` / ``;base64``
# parameters (accepted in either order), a comma, then the payload.
# The named group ``content_type`` captures the media type so callers can
# compare it against a whitelist. Compiled with re.VERBOSE, so whitespace
# and ``#`` comments inside the pattern are ignored by the regex engine.
content_type_rgx = re.compile(r'''
^
# Match a content type <application>/<type>
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
# Match any character set and encoding
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
# Assume the rest is data
,.*
$
''',
re.VERBOSE)
class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
@@ -100,8 +115,8 @@ class HTMLSanitizerMixin(object):
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
'y1', 'y2', 'zoomAndPan']
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster',
'xlink:href', 'xml:base']
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc',
'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base']
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
@@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object):
acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
'ssh', 'sftp', 'rtsp', 'afs']
'ssh', 'sftp', 'rtsp', 'afs', 'data']
acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
# subclasses may define their own versions of these constants
allowed_elements = acceptable_elements + mathml_elements + svg_elements
@@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object):
allowed_css_keywords = acceptable_css_keywords
allowed_svg_properties = acceptable_svg_properties
allowed_protocols = acceptable_protocols
allowed_content_types = acceptable_content_types
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +207,17 @@ class HTMLSanitizerMixin(object):
unescape(attrs[attr])).lower()
# remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace("\ufffd", "")
if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
(val_unescaped.split(':')[0] not in
self.allowed_protocols)):
del attrs[attr]
uri = urlparse.urlparse(val_unescaped)
if uri and uri.scheme:
if uri.scheme not in self.allowed_protocols:
del attrs[attr]
if uri.scheme == 'data':
m = content_type_rgx.match(uri.path)
if not m:
del attrs[attr]
elif m.group('content_type') not in self.allowed_content_types:
del attrs[attr]
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
@@ -245,7 +270,7 @@ class HTMLSanitizerMixin(object):
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
'padding']:
for keyword in value.split():
if not keyword in self.acceptable_css_keywords and \
if keyword not in self.acceptable_css_keywords and \
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
break
else:

View File

@@ -1,9 +1,6 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
import gettext
_ = gettext.gettext
try:
from functools import reduce
except ImportError:
@@ -35,7 +32,7 @@ else:
v = utils.surrogatePairToCodepoint(v)
else:
v = ord(v)
if not v in encode_entity_map or k.islower():
if v not in encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
encode_entity_map[v] = k
@@ -208,7 +205,7 @@ class HTMLSerializer(object):
if token["systemId"]:
if token["systemId"].find('"') >= 0:
if token["systemId"].find("'") >= 0:
self.serializeError(_("System identifer contains both single and double quote characters"))
self.serializeError("System identifer contains both single and double quote characters")
quote_char = "'"
else:
quote_char = '"'
@@ -220,7 +217,7 @@ class HTMLSerializer(object):
elif type in ("Characters", "SpaceCharacters"):
if type == "SpaceCharacters" or in_cdata:
if in_cdata and token["data"].find("</") >= 0:
self.serializeError(_("Unexpected </ in CDATA"))
self.serializeError("Unexpected </ in CDATA")
yield self.encode(token["data"])
else:
yield self.encode(escape(token["data"]))
@@ -231,7 +228,7 @@ class HTMLSerializer(object):
if name in rcdataElements and not self.escape_rcdata:
in_cdata = True
elif in_cdata:
self.serializeError(_("Unexpected child element of a CDATA element"))
self.serializeError("Unexpected child element of a CDATA element")
for (attr_namespace, attr_name), attr_value in token["data"].items():
# TODO: Add namespace support here
k = attr_name
@@ -279,20 +276,20 @@ class HTMLSerializer(object):
if name in rcdataElements:
in_cdata = False
elif in_cdata:
self.serializeError(_("Unexpected child element of a CDATA element"))
self.serializeError("Unexpected child element of a CDATA element")
yield self.encodeStrict("</%s>" % name)
elif type == "Comment":
data = token["data"]
if data.find("--") >= 0:
self.serializeError(_("Comment contains --"))
self.serializeError("Comment contains --")
yield self.encodeStrict("<!--%s-->" % token["data"])
elif type == "Entity":
name = token["name"]
key = name + ";"
if not key in entities:
self.serializeError(_("Entity %s not recognized" % name))
if key not in entities:
self.serializeError("Entity %s not recognized" % name)
if self.resolve_entities and key not in xmlEntities:
data = entities[key]
else:

View File

@@ -158,7 +158,7 @@ def getDomBuilder(DomImplementation):
else:
# HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'):
if not Node.TEXT_NODE in self.dom._child_node_types:
if Node.TEXT_NODE not in self.dom._child_node_types:
self.dom._child_node_types = list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE)
self.dom.appendChild(self.dom.createTextNode(data))

View File

@@ -10,8 +10,12 @@ returning an iterator generating tokens.
from __future__ import absolute_import, division, unicode_literals
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree",
"pulldom"]
import sys
from .. import constants
from ..utils import default_etree
treeWalkerCache = {}
@@ -55,3 +59,89 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
# XXX: NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeWalker
return treeWalkerCache.get(treeType)
def concatenateCharacterTokens(tokens):
    """Yield *tokens* with each run of adjacent text tokens merged.

    Consecutive "Characters" and "SpaceCharacters" tokens are collapsed
    into a single new ``{"type": "Characters", "data": ...}`` token whose
    data is the concatenation of the run. Every other token is passed
    through unchanged, in its original position.
    """
    text_run = []
    for tok in tokens:
        if tok["type"] not in ("Characters", "SpaceCharacters"):
            # A non-text token ends the current run: emit the buffered
            # text first so ordering is preserved.
            if text_run:
                yield {"type": "Characters", "data": "".join(text_run)}
                text_run = []
            yield tok
            continue
        text_run.append(tok["data"])
    # Flush text left over at the end of the stream.
    if text_run:
        yield {"type": "Characters", "data": "".join(text_run)}
def pprint(walker):
    """Pretty printer for tree walkers.

    Consumes the token stream of *walker* (with adjacent text tokens merged
    first) and returns one string with a line per tag, attribute, comment,
    doctype and text node, indented two spaces per open element.

    Raises ValueError for a token type it does not know.
    """
    lines = []
    depth = 0
    for token in concatenateCharacterTokens(walker):
        kind = token["type"]

        if kind in ("StartTag", "EmptyTag"):
            # Tag name: elements outside the HTML namespace get a prefix.
            tag_ns = token["namespace"]
            if tag_ns and tag_ns != constants.namespaces["html"]:
                shown_ns = constants.prefixes[tag_ns] if tag_ns in constants.prefixes else tag_ns
                label = "%s %s" % (shown_ns, token["name"])
            else:
                label = token["name"]
            lines.append("%s<%s>" % (" " * depth, label))
            depth += 2

            # Attributes are listed one level deeper than their element,
            # sorted for consistent ordering.
            for (attr_ns, local), value in sorted(token["data"].items()):
                if attr_ns:
                    shown_ns = constants.prefixes[attr_ns] if attr_ns in constants.prefixes else attr_ns
                    label = "%s %s" % (shown_ns, local)
                else:
                    label = local
                lines.append("%s%s=\"%s\"" % (" " * depth, label, value))

            # A self-closing element has no matching EndTag to undo the indent.
            if kind == "EmptyTag":
                depth -= 2

        elif kind == "EndTag":
            depth -= 2

        elif kind == "Comment":
            lines.append("%s<!-- %s -->" % (" " * depth, token["data"]))

        elif kind == "Doctype":
            pad = " " * depth
            if not token["name"]:
                lines.append("%s<!DOCTYPE >" % (pad,))
            elif token["publicId"]:
                lines.append("""%s<!DOCTYPE %s "%s" "%s">""" %
                             (pad,
                              token["name"],
                              token["publicId"],
                              token["systemId"] or ""))
            elif token["systemId"]:
                lines.append("""%s<!DOCTYPE %s "" "%s">""" %
                             (pad,
                              token["name"],
                              token["systemId"]))
            else:
                lines.append("%s<!DOCTYPE %s>" % (pad, token["name"]))

        elif kind == "Characters":
            lines.append("%s\"%s\"" % (" " * depth, token["data"]))

        elif kind == "SpaceCharacters":
            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"

        else:
            raise ValueError("Unknown token type, %s" % kind)

    return "\n".join(lines)

View File

@@ -1,8 +1,8 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type, string_types
import gettext
_ = gettext.gettext
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
"TreeWalker", "NonRecursiveTreeWalker"]
from xml.dom import Node
@@ -58,7 +58,7 @@ class TreeWalker(object):
"namespace": to_text(namespace),
"data": attrs}
if hasChildren:
yield self.error(_("Void element has children"))
yield self.error("Void element has children")
def startTag(self, namespace, name, attrs):
assert namespace is None or isinstance(namespace, string_types), type(namespace)
@@ -122,7 +122,7 @@ class TreeWalker(object):
return {"type": "Entity", "name": text_type(name)}
def unknown(self, nodeType):
return self.error(_("Unknown node type: ") + nodeType)
return self.error("Unknown node type: " + nodeType)
class NonRecursiveTreeWalker(TreeWalker):

View File

@@ -2,9 +2,6 @@ from __future__ import absolute_import, division, unicode_literals
from xml.dom import Node
import gettext
_ = gettext.gettext
from . import _base

View File

@@ -7,12 +7,10 @@ except ImportError:
from ordereddict import OrderedDict
except ImportError:
OrderedDict = dict
import gettext
_ = gettext.gettext
import re
from six import text_type
from six import string_types
from . import _base
from ..utils import moduleFactoryFactory
@@ -60,7 +58,7 @@ def getETreeBuilder(ElementTreeImplementation):
return _base.COMMENT, node.text
else:
assert type(node.tag) == text_type, type(node.tag)
assert isinstance(node.tag, string_types), type(node.tag)
# This is assumed to be an ordinary element
match = tag_regexp.match(node.tag)
if match:

View File

@@ -4,9 +4,6 @@ from six import text_type
from lxml import etree
from ..treebuilders.etree import tag_regexp
from gettext import gettext
_ = gettext
from . import _base
from .. import ihatexml
@@ -130,7 +127,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
def getNodeDetails(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
return _base.TEXT, ensure_str(getattr(node, key))
elif isinstance(node, Root):
@@ -169,7 +166,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
attrs, len(node) > 0 or node.text)
def getFirstChild(self, node):
assert not isinstance(node, tuple), _("Text nodes have no children")
assert not isinstance(node, tuple), "Text nodes have no children"
assert len(node) or node.text, "Node has no children"
if node.text:
@@ -180,7 +177,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
def getNextSibling(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
if key == "text":
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
# because node[0] might evaluate to False if it has no child element
@@ -196,7 +193,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
def getParentNode(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
if key == "text":
return node
# else: fallback to "normal" processing

View File

@@ -2,6 +2,8 @@ from __future__ import absolute_import, division, unicode_literals
from types import ModuleType
from six import text_type
try:
import xml.etree.cElementTree as default_etree
except ImportError:
@@ -9,7 +11,26 @@ except ImportError:
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
"surrogatePairToCodepoint", "moduleFactoryFactory"]
"surrogatePairToCodepoint", "moduleFactoryFactory",
"supports_lone_surrogates"]
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
# caught by the below test. In general this would be any platform
# using UTF-16 as its encoding of unicode strings, such as
# Jython. This is because UTF-16 itself is based on the use of such
# surrogates, and there is no mechanism to further escape such
# escapes.
# Probe whether this platform can represent a lone surrogate in a text
# string. eval is used on a constant literal so that the surrogate escape is
# only parsed at runtime; any failure to build it means "not supported".
try:
    _x = eval('"\\uD800"')
    if not isinstance(_x, text_type):
        # We need this with u"" because of http://bugs.jython.org/issue2039
        _x = eval('u"\\uD800"')
        # Explicit check rather than ``assert``: asserts are stripped under
        # ``python -O``, which would silently turn this probe into a no-op.
        if not isinstance(_x, text_type):
            raise TypeError("lone surrogate literal did not produce a text string")
except Exception:
    # Only ordinary errors mean "unsupported"; KeyboardInterrupt and
    # SystemExit must propagate instead of being misread as a probe result.
    supports_lone_surrogates = False
else:
    supports_lone_surrogates = True
class MethodDispatcher(dict):