mirror of
https://github.com/rembo10/headphones.git
synced 2026-05-18 09:35:30 +01:00
rutracker revision
- Now uses requests with more logging - Update to latest BeautifulSoup and html5lib libs
This commit is contained in:
223
headphones/rutracker.py
Normal file
223
headphones/rutracker.py
Normal file
@@ -0,0 +1,223 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import urllib
|
||||
import requests as requests
|
||||
from urlparse import urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
import headphones
|
||||
from headphones import logger
|
||||
|
||||
class Rutracker(object):
    """Session-based client for rutracker.org.

    One ``requests`` session is kept per instance and reused to log in,
    run tracker searches and download ``.torrent`` payloads.
    """

    # Multipliers applied to the numeric part of a formatted size; any other
    # unit leaves the number unchanged.
    _SIZE_UNITS = {
        'KB': 1024,
        'MB': 1024 ** 2,
        'GB': 1024 ** 3,
        'TB': 1024 ** 4,
    }

    def __init__(self):
        self.session = requests.session()
        self.timeout = 60
        self.loggedin = False
        # Upper size bound in bytes; set by searchurl() from the format.
        self.maxsize = 0
        self.search_referer = 'http://rutracker.org/forum/tracker.php'

    @staticmethod
    def _torrent_id(url):
        """Return the value of the ``t`` query parameter of *url*.

        Raises KeyError when the query string has no ``t`` parameter.
        """
        query = urlparse(url)[4]
        return dict([part.split('=') for part in query.split('&')])['t']

    def logged_in(self):
        """Return the login state recorded by the last login() call."""
        return self.loggedin

    def still_logged_in(self, html):
        """Return False when *html* is empty or contains the login form.

        *html* must be the page markup as a string; the check is a plain
        substring test for the login form's ``action`` attribute.
        """
        if not html or "action=\"http://login.rutracker.org/forum/login.php\">" in html:
            return False
        return True

    def login(self):
        """
        Logs in user

        Posts the configured credentials to the login page and records the
        outcome in ``self.loggedin``. Returns True when the response carries
        a ``bb_data`` cookie, False otherwise (including on any exception).
        """

        loginpage = 'http://login.rutracker.org/forum/login.php'
        post_params = {
            'login_username': headphones.CONFIG.RUTRACKER_USER,
            'login_password': headphones.CONFIG.RUTRACKER_PASSWORD,
            'login': b'\xc2\xf5\xee\xe4'  # '%C2%F5%EE%E4'
        }

        logger.info("Attempting to log in to rutracker...")

        # User agent doesn't seem to matter?
        self.loggedin = False
        try:
            r = self.session.post(loginpage, data=post_params, timeout=self.timeout)
            if r.status_code != 200:
                logger.error("rutracker login returned status code %s" % r.status_code)
            elif 'bb_data' in r.cookies.keys():
                self.loggedin = True
                logger.info("Successfully logged in to rutracker")
            else:
                # Adjacent literals are concatenated by the parser; the
                # previous "/" between them raised TypeError at runtime.
                logger.error("Could not login to rutracker, credentials maybe incorrect, "
                             "site is down or too many attempts")
        except Exception as e:
            logger.error("Unknown error logging in to rutracker: %s" % e)
            self.loggedin = False

        return self.loggedin

    def searchurl(self, artist, album, year, format):
        """
        Return the search url

        Also sets ``self.maxsize`` according to *format* ('lossless',
        'lossless+mp3', or anything else for lossy only).
        """

        # Build search url
        searchterm = ''
        if artist != 'Various Artists':
            searchterm = artist
            searchterm = searchterm + ' '
        searchterm = searchterm + album
        searchterm = searchterm + ' '
        searchterm = searchterm + year

        if format == 'lossless':
            format = '+lossless'
            self.maxsize = 10000000000
        elif format == 'lossless+mp3':
            format = '+lossless||mp3||aac'
            self.maxsize = 10000000000
        else:
            format = '+mp3||aac'
            self.maxsize = 300000000

        # sort by size, descending.
        sort = '&o=7&s=2'

        searchurl = "%s?nm=%s%s%s" % (self.search_referer, urllib.quote(searchterm), format, sort)

        logger.info("Searching rutracker using term: %s", searchterm)

        return searchurl

    def search(self, searchurl):
        """
        Parse the search results and return valid torrent list

        Returns a list of ``(title, size, topicurl, 'rutracker.org',
        'torrent', True)`` tuples (possibly empty), or None when the page
        could not be fetched, the session could not be re-established, no
        result table was found, or parsing raised.
        """

        try:
            headers = {'Referer': self.search_referer}
            r = self.session.get(url=searchurl, headers=headers, timeout=self.timeout)

            # Check if still logged in. The raw markup is tested, not the
            # parsed soup: a soup object is always truthy and its "in"
            # operator looks at child nodes, so the check could never fail.
            if not self.still_logged_in(r.text):
                self.login()
                r = self.session.get(url=searchurl, headers=headers, timeout=self.timeout)
                if not self.still_logged_in(r.text):
                    logger.error("Error getting rutracker data")
                    return None

            soup = BeautifulSoup(r.content, 'html5lib')

            # Process
            rulist = []
            i = soup.find('table', id='tor-tbl')
            if not i:
                logger.info("No valid results found from rutracker")
                return None
            minimumseeders = int(headphones.CONFIG.NUMBEROFSEEDERS) - 1

            rows = zip(i.find_all(class_='hl-tags'),
                       i.find_all(class_='dl-stub'),
                       i.find_all(class_='seedmed'))
            for item in rows:
                title = item[0].get_text()
                url = item[1].get('href')
                # The last two characters of the link text are dropped before
                # splitting into "<number> <unit>".
                size_formatted = item[1].get_text()[:-2]
                seeds = item[2].get_text()
                size_parts = size_formatted.split()
                size = float(size_parts[0])
                size *= self._SIZE_UNITS.get(size_parts[1], 1)

                if size < self.maxsize and minimumseeders < int(seeds):
                    logger.info('Found %s. Size: %s' % (title, size_formatted))
                    # Torrent topic page
                    torrent_id = self._torrent_id(url)
                    topicurl = 'http://rutracker.org/forum/viewtopic.php?t=' + torrent_id
                    rulist.append((title, size, topicurl, 'rutracker.org', 'torrent', True))
                else:
                    logger.info("%s is larger than the maxsize or has too little seeders for this category, "
                                "skipping. (Size: %i bytes, Seeders: %i)" % (title, size, int(seeds)))

            if not rulist:
                logger.info("No valid results found from rutracker")

            return rulist

        except Exception as e:
            logger.error("An unknown error occurred in the rutracker parser: %s" % e)
            return None

    def get_torrent_data(self, url):
        """
        return the .torrent data

        *url* is a topic url carrying the torrent id in its ``t`` query
        parameter. Returns the response body, or False when the request
        raised.
        """

        torrent_id = self._torrent_id(url)
        downloadurl = 'http://dl.rutracker.org/forum/dl.php?t=' + torrent_id
        cookie = {'bb_dl': torrent_id}
        try:
            headers = {'Referer': url}
            r = self.session.get(url=downloadurl, cookies=cookie, headers=headers, timeout=self.timeout)
            return r.content
        except Exception as e:
            logger.error('Error getting torrent: %s', e)
            return False

    #TODO get this working in utorrent.py
    def utorrent_add_file(self, data):
        """Upload raw .torrent *data* to the configured uTorrent web UI.

        Returns None in all cases; failures are logged.
        """

        host = headphones.CONFIG.UTORRENT_HOST
        if not host.startswith('http'):
            host = 'http://' + host
        if host.endswith('/'):
            host = host[:-1]
        if host.endswith('/gui'):
            host = host[:-4]

        base_url = host
        url = base_url + '/gui/'

        # Use a dedicated session: setting auth/params on self.session would
        # attach the uTorrent credentials and token to every later request
        # made to rutracker.org through that session.
        session = requests.Session()
        session.auth = (headphones.CONFIG.UTORRENT_USERNAME, headphones.CONFIG.UTORRENT_PASSWORD)

        try:
            r = session.get(url + 'token.html')
        except Exception as e:
            logger.error('Error getting token: %s', e)
            return

        if r.status_code == 401:
            logger.debug('Error reaching utorrent')
            return

        regex = re.search(r'.+>([^<]+)</div></html>', r.text)
        if regex is None:
            logger.debug('Error reading token')
            return

        session.params = {'token': regex.group(1)}
        files = {'torrent_file': ("", data)}

        try:
            session.post(url, params={'action': 'add-file'}, files=files)
        except Exception as e:
            logger.exception('Error adding file to utorrent %s', e)
|
||||
|
||||
@@ -36,12 +36,10 @@ import unicodedata
|
||||
|
||||
from headphones.common import USER_AGENT
|
||||
from headphones import logger, db, helpers, classes, sab, nzbget, request
|
||||
from headphones import utorrent, transmission, notifiers
|
||||
from headphones import utorrent, transmission, notifiers, rutracker
|
||||
|
||||
from bencode import bencode, bdecode
|
||||
|
||||
import headphones.searcher_rutracker as rutrackersearch
|
||||
|
||||
# Magnet to torrent services, for Black hole. Stolen from CouchPotato.
|
||||
TORRENT_TO_MAGNET_SERVICES = [
|
||||
'https://zoink.it/torrent/%s.torrent',
|
||||
@@ -51,9 +49,7 @@ TORRENT_TO_MAGNET_SERVICES = [
|
||||
|
||||
# Persistent What.cd API object
|
||||
gazelle = None
|
||||
|
||||
# RUtracker search object
|
||||
rutracker = rutrackersearch.Rutracker()
|
||||
ruobj = None
|
||||
|
||||
|
||||
def fix_url(s, charset="utf-8"):
|
||||
@@ -818,15 +814,9 @@ def send_to_downloader(data, bestqual, album):
|
||||
"to open or convert magnet links")
|
||||
return
|
||||
else:
|
||||
if bestqual[3] == "rutracker.org":
|
||||
download_path, _ = rutracker.get_torrent(bestqual[2],
|
||||
headphones.CONFIG.TORRENTBLACKHOLE_DIR)
|
||||
|
||||
if not download_path:
|
||||
return
|
||||
else:
|
||||
if not torrent_to_file(download_path, data):
|
||||
return
|
||||
if not torrent_to_file(download_path, data):
|
||||
return
|
||||
|
||||
# Extract folder name from torrent
|
||||
folder_name = read_torrent_name(download_path, bestqual[0])
|
||||
@@ -836,13 +826,11 @@ def send_to_downloader(data, bestqual, album):
|
||||
elif headphones.CONFIG.TORRENT_DOWNLOADER == 1:
|
||||
logger.info("Sending torrent to Transmission")
|
||||
|
||||
# rutracker needs cookies to be set, pass the .torrent file instead of url
|
||||
# Add torrent
|
||||
if bestqual[3] == 'rutracker.org':
|
||||
file_or_url, torrentid = rutracker.get_torrent(bestqual[2])
|
||||
torrentid = transmission.addTorrent('', data)
|
||||
else:
|
||||
file_or_url = bestqual[2]
|
||||
|
||||
torrentid = transmission.addTorrent(file_or_url)
|
||||
torrentid = transmission.addTorrent(bestqual[2])
|
||||
|
||||
if not torrentid:
|
||||
logger.error("Error sending torrent to Transmission. Are you sure it's running?")
|
||||
@@ -855,13 +843,6 @@ def send_to_downloader(data, bestqual, album):
|
||||
logger.error('Torrent folder name could not be determined')
|
||||
return
|
||||
|
||||
# remove temp .torrent file created above
|
||||
if bestqual[3] == 'rutracker.org':
|
||||
try:
|
||||
shutil.rmtree(os.path.split(file_or_url)[0])
|
||||
except Exception as e:
|
||||
logger.exception("Unhandled exception")
|
||||
|
||||
# Set Seed Ratio
|
||||
seed_ratio = get_seed_ratio(bestqual[3])
|
||||
if seed_ratio is not None:
|
||||
@@ -870,30 +851,30 @@ def send_to_downloader(data, bestqual, album):
|
||||
else:# if headphones.CONFIG.TORRENT_DOWNLOADER == 2:
|
||||
logger.info("Sending torrent to uTorrent")
|
||||
|
||||
# rutracker needs cookies to be set, pass the .torrent file instead of url
|
||||
# Add torrent
|
||||
if bestqual[3] == 'rutracker.org':
|
||||
file_or_url, torrentid = rutracker.get_torrent(bestqual[2])
|
||||
folder_name, cacheid = utorrent.dirTorrent(torrentid)
|
||||
folder_name = os.path.basename(os.path.normpath(folder_name))
|
||||
utorrent.labelTorrent(torrentid)
|
||||
ruobj.utorrent_add_file(data)
|
||||
else:
|
||||
file_or_url = bestqual[2]
|
||||
torrentid = calculate_torrent_hash(file_or_url, data)
|
||||
folder_name = utorrent.addTorrent(file_or_url, torrentid)
|
||||
utorrent.addTorrent(bestqual[2])
|
||||
|
||||
# Get hash
|
||||
torrentid = calculate_torrent_hash(bestqual[2], data)
|
||||
if not torrentid:
|
||||
logger.error('Torrent id could not be determined')
|
||||
return
|
||||
|
||||
# Set Label
|
||||
if headphones.CONFIG.UTORRENT_LABEL:
|
||||
utorrent.labelTorrent(torrentid)
|
||||
|
||||
# Get folder
|
||||
folder_name = utorrent.getFolder(torrentid)
|
||||
if folder_name:
|
||||
logger.info('Torrent folder name: %s' % folder_name)
|
||||
else:
|
||||
logger.error('Torrent folder name could not be determined')
|
||||
return
|
||||
|
||||
# remove temp .torrent file created above
|
||||
if bestqual[3] == 'rutracker.org':
|
||||
try:
|
||||
shutil.rmtree(os.path.split(file_or_url)[0])
|
||||
except Exception as e:
|
||||
logger.exception("Unhandled exception")
|
||||
|
||||
# Set Seed Ratio
|
||||
seed_ratio = get_seed_ratio(bestqual[3])
|
||||
if seed_ratio is not None:
|
||||
@@ -1041,12 +1022,7 @@ def verifyresult(title, artistterm, term, lossless):
|
||||
|
||||
def searchTorrent(album, new=False, losslessOnly=False, albumlength=None, choose_specific_download=False):
|
||||
global gazelle # persistent what.cd api object to reduce number of login attempts
|
||||
|
||||
# rutracker login
|
||||
if headphones.CONFIG.RUTRACKER and album:
|
||||
rulogin = rutracker.login(headphones.CONFIG.RUTRACKER_USER, headphones.CONFIG.RUTRACKER_PASSWORD)
|
||||
if not rulogin:
|
||||
logger.info(u'Could not login to rutracker, search results will exclude this provider')
|
||||
global ruobj # and rutracker
|
||||
|
||||
albumid = album['AlbumID']
|
||||
reldate = album['ReleaseDate']
|
||||
@@ -1239,45 +1215,38 @@ def searchTorrent(album, new=False, losslessOnly=False, albumlength=None, choose
|
||||
logger.error(u"An error occurred while trying to parse the response from Waffles.fm: %s", e)
|
||||
|
||||
# rutracker.org
|
||||
if headphones.CONFIG.RUTRACKER and rulogin:
|
||||
if headphones.CONFIG.RUTRACKER:
|
||||
provider = "rutracker.org"
|
||||
|
||||
# Ignore if release date not specified, results too unpredictable
|
||||
if not year and not usersearchterm:
|
||||
logger.info(u'Release date not specified, ignoring for rutracker.org')
|
||||
logger.info(u"Release date not specified, ignoring for rutracker.org")
|
||||
else:
|
||||
|
||||
if headphones.CONFIG.PREFERRED_QUALITY == 3 or losslessOnly:
|
||||
format = 'lossless'
|
||||
maxsize = 10000000000
|
||||
elif headphones.CONFIG.PREFERRED_QUALITY == 1 or allow_lossless:
|
||||
format = 'lossless+mp3'
|
||||
maxsize = 10000000000
|
||||
else:
|
||||
format = 'mp3'
|
||||
maxsize = 300000000
|
||||
|
||||
# build search url based on above
|
||||
if not usersearchterm:
|
||||
searchURL = rutracker.searchurl(artistterm, albumterm, year, format)
|
||||
else:
|
||||
searchURL = rutracker.searchurl(usersearchterm, ' ', ' ', format)
|
||||
# Login
|
||||
if not ruobj or not ruobj.logged_in():
|
||||
ruobj = rutracker.Rutracker()
|
||||
if not ruobj.login():
|
||||
ruobj = None
|
||||
|
||||
logger.info(u'Parsing results from <a href="%s">rutracker.org</a>' % searchURL)
|
||||
if ruobj and ruobj.logged_in():
|
||||
|
||||
# parse results and get best match
|
||||
rulist = rutracker.search(searchURL, maxsize, minimumseeders, albumid)
|
||||
# build search url
|
||||
if not usersearchterm:
|
||||
searchURL = ruobj.searchurl(artistterm, albumterm, year, format)
|
||||
else:
|
||||
searchURL = ruobj.searchurl(usersearchterm, ' ', ' ', format)
|
||||
|
||||
# add best match to overall results list
|
||||
if rulist:
|
||||
for ru in rulist:
|
||||
title = ru[0].decode('utf-8')
|
||||
size = ru[1]
|
||||
url = ru[2]
|
||||
resultlist.append((title, size, url, provider, 'torrent', True))
|
||||
logger.info('Found %s. Size: %s' % (title, helpers.bytes_to_mb(size)))
|
||||
else:
|
||||
logger.info(u"No valid results found from %s" % (provider))
|
||||
# parse results
|
||||
rulist = ruobj.search(searchURL)
|
||||
if rulist:
|
||||
resultlist.extend(rulist)
|
||||
|
||||
if headphones.CONFIG.WHATCD:
|
||||
provider = "What.cd"
|
||||
@@ -1567,12 +1536,14 @@ def preprocess(resultlist):
|
||||
|
||||
for result in resultlist:
|
||||
if result[4] == 'torrent':
|
||||
|
||||
# rutracker always needs the torrent data
|
||||
if result[3] == 'rutracker.org':
|
||||
return ruobj.get_torrent_data(result[2]), result
|
||||
|
||||
#Get out of here if we're using Transmission
|
||||
if headphones.CONFIG.TORRENT_DOWNLOADER == 1: ## if not a magnet link still need the .torrent to generate hash... uTorrent support labeling
|
||||
return True, result
|
||||
# get outta here if rutracker
|
||||
if result[3] == 'rutracker.org':
|
||||
return True, result
|
||||
# Get out of here if it's a magnet link
|
||||
if result[2].lower().startswith("magnet:"):
|
||||
return True, result
|
||||
|
||||
@@ -1,349 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
|
||||
# Headphones rutracker.org search
|
||||
# Functions called from searcher.py
|
||||
|
||||
from bencode import bencode as bencode, bdecode
|
||||
from urlparse import urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
from tempfile import mkdtemp
|
||||
from hashlib import sha1
|
||||
|
||||
import headphones
|
||||
import requests
|
||||
import cookielib
|
||||
import urllib2
|
||||
import urllib
|
||||
import re
|
||||
import os
|
||||
|
||||
from headphones import db, logger
|
||||
|
||||
|
||||
class Rutracker():
    """urllib2-based rutracker.org client used by searcher.py.

    Keeps a cookie jar and an opener built on it; the opener is also
    installed as the process-wide urllib2 opener.
    """

    logged_in = False

    def __init__(self):

        self.cookiejar = cookielib.CookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookiejar))
        urllib2.install_opener(self.opener)

    def _set_download_cookie(self, torrent_id):
        """Put the ``bb_dl`` cookie for *torrent_id* into the cookie jar."""
        self.cookiejar.set_cookie(cookielib.Cookie(version=0, name='bb_dl', value=torrent_id, port=None, port_specified=False, domain='.rutracker.org', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False))

    def login(self, login, password):
        """Implements tracker login procedure.

        Returns True when a ``bb_data`` cookie is present in the cookie jar
        after the login request, False otherwise (including when *login* or
        *password* is None).
        """

        self.logged_in = False

        if login is None or password is None:
            return False

        params = urllib.urlencode({"login_username": login,
                                   "login_password": password,
                                   "login": "Вход"})

        try:
            self.opener.open("http://login.rutracker.org/forum/login.php", params)
        except Exception as e:
            # Still best-effort: the cookie check below decides the result,
            # but the failure is no longer silent.
            logger.error('Error logging in to rutracker: %s' % e)

        # Check if we're logged in
        for cookie in self.cookiejar:
            if cookie.name == 'bb_data':
                self.logged_in = True

        return self.logged_in

    def searchurl(self, artist, album, year, format):
        """
        Return the search url
        """

        # Build search url
        searchterm = ''
        if artist != 'Various Artists':
            searchterm = artist
            searchterm = searchterm + ' '
        searchterm = searchterm + album
        searchterm = searchterm + ' '
        searchterm = searchterm + year

        providerurl = "http://rutracker.org/forum/tracker.php"

        if format == 'lossless':
            format = '+lossless'
        elif format == 'lossless+mp3':
            format = '+lossless||mp3||aac'
        else:
            format = '+mp3||aac'

        # sort by size, descending.
        sort = '&o=7&s=2'

        searchurl = "%s?nm=%s%s%s" % (providerurl, urllib.quote(searchterm), format, sort)

        return searchurl

    def search(self, searchurl, maxsize, minseeders, albumid):
        """
        Parse the search results and return valid torrent list

        Returns a list of ``(title, size, topicurl)`` tuples, or False when
        nothing was scraped from the page.
        """

        titles = []
        urls = []
        seeders = []
        sizes = []
        rulist = []

        try:

            page = self.opener.open(searchurl, timeout=60)
            soup = BeautifulSoup(page.read())

            # Title
            for link in soup.find_all('a', attrs={'class': 'med tLink hl-tags bold'}):
                titles.append(link.get_text())

            # Download URL
            for link in soup.find_all('a', attrs={'class': 'small tr-dl dl-stub'}):
                urls.append(link.get('href'))

            # Seeders
            for link in soup.find_all('b', attrs={'class': 'seedmed'}):
                seeders.append(link.get_text())

            # Size
            for link in soup.find_all('td', attrs={'class': 'row4 small nowrap tor-size'}):
                sizes.append(link.u.string)

        except Exception as e:
            # Keep whatever was collected so far, but report the failure
            # instead of swallowing it.
            logger.error('Error parsing rutracker search results: %s' % e)

        # Combine lists
        torrentlist = list(zip(titles, urls, seeders, sizes))

        # return if nothing found
        if not torrentlist:
            return False

        # don't bother checking track counts anymore, let searcher filter instead
        # leave code in just in case
        check_track_count = False

        if check_track_count:

            # get headphones track count for album, return if not found
            myDB = db.DBConnection()
            tracks = myDB.select('SELECT * from tracks WHERE AlbumID=?', [albumid])
            hptrackcount = len(tracks)

            if not hptrackcount:
                logger.info('headphones track info not found, cannot compare to torrent')
                return False

        # Return all valid entries, ignored, required words now checked in searcher.py

        formatlist = ['ape', 'flac', 'ogg', 'm4a', 'aac', 'mp3', 'wav', 'aif']
        deluxelist = ['deluxe', 'edition', 'japanese', 'exclusive']

        for entry in torrentlist:

            returntitle = entry[0].encode('utf-8')
            url = entry[1]
            seeds = entry[2]
            size = entry[3]

            if int(size) <= maxsize and int(seeds) >= minseeders:

                # Torrent topic page
                torrent_id = dict([part.split('=') for part in urlparse(url)[4].split('&')])['t']
                topicurl = 'http://rutracker.org/forum/viewtopic.php?t=' + torrent_id

                # add to list
                if not check_track_count:
                    valid = True
                else:

                    # Check torrent info
                    self._set_download_cookie(torrent_id)

                    # Stays empty when the download returns no data, so the
                    # lookups below cannot hit an unbound name.
                    metainfo = {}
                    try:
                        page = self.opener.open(url)
                        torrent_data = page.read()
                        if torrent_data:
                            decoded = bdecode(torrent_data)
                            metainfo = decoded['info']
                        page.close()
                    except Exception as e:
                        logger.error('Error getting torrent: %s' % e)
                        return False

                    # get torrent track count and check for cue
                    trackcount = 0
                    cuecount = 0

                    if 'files' in metainfo:  # multi
                        for pathfile in metainfo['files']:
                            path = pathfile['path']
                            for file in path:
                                if any(file.lower().endswith('.' + x.lower()) for x in formatlist):
                                    trackcount += 1
                                if '.cue' in file:
                                    cuecount += 1

                    title = returntitle.lower()
                    logger.debug('torrent title: %s' % title)
                    logger.debug('headphones trackcount: %s' % hptrackcount)
                    logger.debug('rutracker trackcount: %s' % trackcount)

                    # If torrent track count less than headphones track count, and there's a cue, then attempt to get track count from log(s)
                    # This is for the case where we have a single .flac/.wav which can be split by cue
                    # Not great, but shouldn't be doing this too often
                    totallogcount = 0
                    if trackcount < hptrackcount and cuecount > 0 and cuecount < hptrackcount:
                        page = self.opener.open(topicurl, timeout=60)
                        soup = BeautifulSoup(page.read())
                        findtoc = soup.find_all(text='TOC of the extracted CD')
                        if not findtoc:
                            findtoc = soup.find_all(text='TOC извлечённого CD')
                        for toc in findtoc:
                            logcount = 0
                            for toccontent in toc.find_all_next(text=True):
                                cut_string = toccontent.split('|')
                                new_string = cut_string[0].lstrip().rstrip()
                                if new_string == '1' or new_string == '01':
                                    logcount = 1
                                elif logcount > 0:
                                    if new_string.isdigit():
                                        logcount += 1
                                    else:
                                        break
                            totallogcount = totallogcount + logcount

                    if totallogcount > 0:
                        trackcount = totallogcount
                        logger.debug('rutracker logtrackcount: %s' % totallogcount)

                    # If torrent track count = hp track count then return torrent,
                    # if greater, check for deluxe/special/foreign editions
                    # if less, then allow if it's a single track with a cue
                    valid = False

                    if trackcount == hptrackcount:
                        valid = True
                    elif trackcount > hptrackcount:
                        if any(deluxe in title for deluxe in deluxelist):
                            valid = True

                # Add to list
                if valid:
                    rulist.append((returntitle, size, topicurl))
                else:
                    if topicurl:
                        logger.info(u'<a href="%s">Torrent</a> found with %s tracks but the selected headphones release has %s tracks, skipping for rutracker.org' % (topicurl, trackcount, hptrackcount))
            else:
                logger.info('%s is larger than the maxsize or has too little seeders for this category, skipping. (Size: %i bytes, Seeders: %i)' % (returntitle, int(size), int(seeds)))

        return rulist

    def get_torrent(self, url, savelocation=None):
        """Download the .torrent for topic *url* and write it to disk.

        The file goes into *savelocation* when given, otherwise into a fresh
        temporary directory. Returns ``(download_path, info_hash)``; on any
        failure returns ``(None, None)`` so callers that unpack the result
        and test ``download_path`` keep working.
        """

        torrent_id = dict([part.split('=') for part in urlparse(url)[4].split('&')])['t']
        self._set_download_cookie(torrent_id)
        downloadurl = 'http://dl.rutracker.org/forum/dl.php?t=' + torrent_id
        torrent_name = torrent_id + '.torrent'

        try:
            prev = os.umask(headphones.UMASK)
            try:
                page = self.opener.open(downloadurl)
                torrent = page.read()
                decoded = bdecode(torrent)
                metainfo = decoded['info']
                tor_hash = sha1(bencode(metainfo)).hexdigest()
                if savelocation:
                    download_path = os.path.join(savelocation, torrent_name)
                else:
                    tempdir = mkdtemp(suffix='_rutracker_torrents')
                    download_path = os.path.join(tempdir, torrent_name)

                with open(download_path, 'wb') as f:
                    f.write(torrent)
            finally:
                # Restore the process umask even when the download or the
                # write fails.
                os.umask(prev)

            # Add file to utorrent
            if headphones.CONFIG.TORRENT_DOWNLOADER == 2:
                self.utorrent_add_file(download_path)

        except Exception as e:
            logger.error('Error getting torrent: %s', e)
            return None, None

        return download_path, tor_hash

    #TODO get this working in utorrent.py
    def utorrent_add_file(self, filename):
        """Upload the .torrent file at *filename* to the uTorrent web UI.

        Returns None in all cases; failures are logged.
        """

        host = headphones.CONFIG.UTORRENT_HOST
        if not host.startswith('http'):
            host = 'http://' + host
        if host.endswith('/'):
            host = host[:-1]
        if host.endswith('/gui'):
            host = host[:-4]

        base_url = host
        username = headphones.CONFIG.UTORRENT_USERNAME
        password = headphones.CONFIG.UTORRENT_PASSWORD

        session = requests.Session()
        url = base_url + '/gui/'
        session.auth = (username, password)

        try:
            r = session.get(url + 'token.html')
        except Exception:
            logger.exception('Error getting token')
            return

        # status_code is an int; comparing against the string '401' never
        # matched.
        if r.status_code == 401:
            logger.debug('Error reaching utorrent')
            return

        regex = re.search(r'.+>([^<]+)</div></html>', r.text)
        if regex is None:
            logger.debug('Error reading token')
            return

        session.params = {'token': regex.group(1)}

        with open(filename, 'rb') as f:
            try:
                session.post(url, params={'action': 'add-file'},
                             files={'torrent_file': f})
            except Exception:
                logger.exception('Error adding file to utorrent')
                return
|
||||
@@ -28,12 +28,15 @@ import headphones
|
||||
# Store torrent id so we can check up on it
|
||||
|
||||
|
||||
def addTorrent(link):
|
||||
def addTorrent(link, data=None):
|
||||
method = 'torrent-add'
|
||||
|
||||
if link.endswith('.torrent'):
|
||||
with open(link, 'rb') as f:
|
||||
metainfo = str(base64.b64encode(f.read()))
|
||||
if link.endswith('.torrent') or data:
|
||||
if data:
|
||||
metainfo = str(base64.b64encode(data))
|
||||
else:
|
||||
with open(link, 'rb') as f:
|
||||
metainfo = str(base64.b64encode(f.read()))
|
||||
arguments = {'metainfo': metainfo, 'download-dir': headphones.CONFIG.DOWNLOAD_TORRENT_DIR}
|
||||
else:
|
||||
arguments = {'filename': link, 'download-dir': headphones.CONFIG.DOWNLOAD_TORRENT_DIR}
|
||||
|
||||
@@ -220,7 +220,7 @@ def dirTorrent(hash, cacheid=None, return_name=None):
|
||||
cacheid = torrentList['torrentc']
|
||||
|
||||
for torrent in torrents:
|
||||
if torrent[0].upper() == hash:
|
||||
if torrent[0].upper() == hash.upper():
|
||||
if not return_name:
|
||||
return torrent[26], cacheid
|
||||
else:
|
||||
@@ -228,8 +228,12 @@ def dirTorrent(hash, cacheid=None, return_name=None):
|
||||
|
||||
return None, None
|
||||
|
||||
def addTorrent(link):
|
||||
uTorrentClient = utorrentclient()
|
||||
uTorrentClient.add_url(link)
|
||||
|
||||
def addTorrent(link, hash):
|
||||
|
||||
def getFolder(hash):
|
||||
uTorrentClient = utorrentclient()
|
||||
|
||||
# Get Active Directory from settings
|
||||
@@ -239,8 +243,6 @@ def addTorrent(link, hash):
|
||||
logger.error('Could not get "Put new downloads in:" directory from uTorrent settings, please ensure it is set')
|
||||
return None
|
||||
|
||||
uTorrentClient.add_url(link)
|
||||
|
||||
# Get Torrent Folder Name
|
||||
torrent_folder, cacheid = dirTorrent(hash)
|
||||
|
||||
@@ -254,10 +256,8 @@ def addTorrent(link, hash):
|
||||
|
||||
if torrent_folder == active_dir or not torrent_folder:
|
||||
torrent_folder, cacheid = dirTorrent(hash, cacheid, return_name=True)
|
||||
labelTorrent(hash)
|
||||
return torrent_folder
|
||||
else:
|
||||
labelTorrent(hash)
|
||||
if headphones.SYS_PLATFORM != "win32":
|
||||
torrent_folder = torrent_folder.replace('\\', '/')
|
||||
return os.path.basename(os.path.normpath(torrent_folder))
|
||||
|
||||
@@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||
"""
|
||||
|
||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||
__version__ = "4.3.2"
|
||||
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
|
||||
__version__ = "4.4.0"
|
||||
__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = ['BeautifulSoup']
|
||||
@@ -45,7 +45,7 @@ from .element import (
|
||||
|
||||
# The very first thing we do is give a useful error if someone is
|
||||
# running this code under Python 3 without converting it.
|
||||
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||
|
||||
class BeautifulSoup(Tag):
|
||||
"""
|
||||
@@ -77,8 +77,11 @@ class BeautifulSoup(Tag):
|
||||
|
||||
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
||||
|
||||
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
|
||||
|
||||
def __init__(self, markup="", features=None, builder=None,
|
||||
parse_only=None, from_encoding=None, **kwargs):
|
||||
parse_only=None, from_encoding=None, exclude_encodings=None,
|
||||
**kwargs):
|
||||
"""The Soup object is initialized as the 'root tag', and the
|
||||
provided markup (which can be a string or a file-like object)
|
||||
is fed into the underlying parser."""
|
||||
@@ -114,9 +117,9 @@ class BeautifulSoup(Tag):
|
||||
del kwargs['isHTML']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the isHTML argument to the "
|
||||
"BeautifulSoup constructor. You can pass in features='html' "
|
||||
"or features='xml' to get a builder capable of handling "
|
||||
"one or the other.")
|
||||
"BeautifulSoup constructor. Suggest you use "
|
||||
"features='lxml' for HTML and features='lxml-xml' for "
|
||||
"XML.")
|
||||
|
||||
def deprecated_argument(old_name, new_name):
|
||||
if old_name in kwargs:
|
||||
@@ -140,6 +143,7 @@ class BeautifulSoup(Tag):
|
||||
"__init__() got an unexpected keyword argument '%s'" % arg)
|
||||
|
||||
if builder is None:
|
||||
original_features = features
|
||||
if isinstance(features, basestring):
|
||||
features = [features]
|
||||
if features is None or len(features) == 0:
|
||||
@@ -151,6 +155,16 @@ class BeautifulSoup(Tag):
|
||||
"requested: %s. Do you need to install a parser library?"
|
||||
% ",".join(features))
|
||||
builder = builder_class()
|
||||
if not (original_features == builder.NAME or
|
||||
original_features in builder.ALTERNATE_NAMES):
|
||||
if builder.is_xml:
|
||||
markup_type = "XML"
|
||||
else:
|
||||
markup_type = "HTML"
|
||||
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
|
||||
parser=builder.NAME,
|
||||
markup_type=markup_type))
|
||||
|
||||
self.builder = builder
|
||||
self.is_xml = builder.is_xml
|
||||
self.builder.soup = self
|
||||
@@ -178,6 +192,8 @@ class BeautifulSoup(Tag):
|
||||
# system. Just let it go.
|
||||
pass
|
||||
if is_file:
|
||||
if isinstance(markup, unicode):
|
||||
markup = markup.encode("utf8")
|
||||
warnings.warn(
|
||||
'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
|
||||
if markup[:5] == "http:" or markup[:6] == "https:":
|
||||
@@ -185,12 +201,15 @@ class BeautifulSoup(Tag):
|
||||
# Python 3 otherwise.
|
||||
if ((isinstance(markup, bytes) and not b' ' in markup)
|
||||
or (isinstance(markup, unicode) and not u' ' in markup)):
|
||||
if isinstance(markup, unicode):
|
||||
markup = markup.encode("utf8")
|
||||
warnings.warn(
|
||||
'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
|
||||
|
||||
for (self.markup, self.original_encoding, self.declared_html_encoding,
|
||||
self.contains_replacement_characters) in (
|
||||
self.builder.prepare_markup(markup, from_encoding)):
|
||||
self.builder.prepare_markup(
|
||||
markup, from_encoding, exclude_encodings=exclude_encodings)):
|
||||
self.reset()
|
||||
try:
|
||||
self._feed()
|
||||
@@ -203,6 +222,16 @@ class BeautifulSoup(Tag):
|
||||
self.markup = None
|
||||
self.builder.soup = None
|
||||
|
||||
def __copy__(self):
    """Return an independent copy of this soup.

    The tree is serialized to UTF-8 bytes and parsed again by the same
    class, reusing this object's tree builder.

    :return: A new object of the same type as `self`.
    """
    # The bytes handed to the constructor are produced right here, so
    # their encoding is known. State it explicitly instead of leaving
    # the builder to guess the encoding of markup we generated ourselves.
    clone = type(self)(
        self.encode('utf-8'), builder=self.builder, from_encoding='utf-8')
    # 'utf-8' only describes the intermediate serialization; keep the
    # encoding that was recorded for the markup originally parsed.
    if hasattr(self, 'original_encoding'):
        clone.original_encoding = self.original_encoding
    return clone
|
||||
|
||||
def __getstate__(self):
    """Build the state dictionary used when pickling this object.

    Works on a shallow copy of the instance dictionary, so the live
    object is not modified. The 'builder' entry is left out when the
    builder reports itself as not picklable.

    :return: A dict of instance attributes, possibly without 'builder'.
    """
    state = dict(self.__dict__)
    # Frequently a tree builder can't be pickled.
    drop_builder = 'builder' in state and not self.builder.picklable
    if drop_builder:
        del state['builder']
    return state
|
||||
|
||||
def _feed(self):
|
||||
# Convert the document to Unicode.
|
||||
self.builder.reset()
|
||||
@@ -229,9 +258,7 @@ class BeautifulSoup(Tag):
|
||||
|
||||
def new_string(self, s, subclass=NavigableString):
|
||||
"""Create a new NavigableString associated with this soup."""
|
||||
navigable = subclass(s)
|
||||
navigable.setup()
|
||||
return navigable
|
||||
return subclass(s)
|
||||
|
||||
def insert_before(self, successor):
|
||||
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
|
||||
@@ -290,14 +317,49 @@ class BeautifulSoup(Tag):
|
||||
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
||||
"""Add an object to the parse tree."""
|
||||
parent = parent or self.currentTag
|
||||
most_recent_element = most_recent_element or self._most_recent_element
|
||||
o.setup(parent, most_recent_element)
|
||||
previous_element = most_recent_element or self._most_recent_element
|
||||
|
||||
next_element = previous_sibling = next_sibling = None
|
||||
if isinstance(o, Tag):
|
||||
next_element = o.next_element
|
||||
next_sibling = o.next_sibling
|
||||
previous_sibling = o.previous_sibling
|
||||
if not previous_element:
|
||||
previous_element = o.previous_element
|
||||
|
||||
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
|
||||
|
||||
if most_recent_element is not None:
|
||||
most_recent_element.next_element = o
|
||||
self._most_recent_element = o
|
||||
parent.contents.append(o)
|
||||
|
||||
if parent.next_sibling:
|
||||
# This node is being inserted into an element that has
|
||||
# already been parsed. Deal with any dangling references.
|
||||
index = parent.contents.index(o)
|
||||
if index == 0:
|
||||
previous_element = parent
|
||||
previous_sibling = None
|
||||
else:
|
||||
previous_element = previous_sibling = parent.contents[index-1]
|
||||
if index == len(parent.contents)-1:
|
||||
next_element = parent.next_sibling
|
||||
next_sibling = None
|
||||
else:
|
||||
next_element = next_sibling = parent.contents[index+1]
|
||||
|
||||
o.previous_element = previous_element
|
||||
if previous_element:
|
||||
previous_element.next_element = o
|
||||
o.next_element = next_element
|
||||
if next_element:
|
||||
next_element.previous_element = o
|
||||
o.next_sibling = next_sibling
|
||||
if next_sibling:
|
||||
next_sibling.previous_sibling = o
|
||||
o.previous_sibling = previous_sibling
|
||||
if previous_sibling:
|
||||
previous_sibling.next_sibling = o
|
||||
|
||||
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
||||
"""Pops the tag stack up to and including the most recent
|
||||
instance of the given tag. If inclusivePop is false, pops the tag
|
||||
|
||||
@@ -80,9 +80,12 @@ builder_registry = TreeBuilderRegistry()
|
||||
class TreeBuilder(object):
|
||||
"""Turn a document into a Beautiful Soup object tree."""
|
||||
|
||||
NAME = "[Unknown tree builder]"
|
||||
ALTERNATE_NAMES = []
|
||||
features = []
|
||||
|
||||
is_xml = False
|
||||
picklable = False
|
||||
preserve_whitespace_tags = set()
|
||||
empty_element_tags = None # A tag will be considered an empty-element
|
||||
# tag when and only when it has no contents.
|
||||
|
||||
@@ -2,6 +2,7 @@ __all__ = [
|
||||
'HTML5TreeBuilder',
|
||||
]
|
||||
|
||||
from pdb import set_trace
|
||||
import warnings
|
||||
from bs4.builder import (
|
||||
PERMISSIVE,
|
||||
@@ -9,7 +10,10 @@ from bs4.builder import (
|
||||
HTML_5,
|
||||
HTMLTreeBuilder,
|
||||
)
|
||||
from bs4.element import NamespacedAttribute
|
||||
from bs4.element import (
|
||||
NamespacedAttribute,
|
||||
whitespace_re,
|
||||
)
|
||||
import html5lib
|
||||
from html5lib.constants import namespaces
|
||||
from bs4.element import (
|
||||
@@ -22,11 +26,20 @@ from bs4.element import (
|
||||
class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||
"""Use html5lib to build a tree."""
|
||||
|
||||
features = ['html5lib', PERMISSIVE, HTML_5, HTML]
|
||||
NAME = "html5lib"
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding):
|
||||
features = [NAME, PERMISSIVE, HTML_5, HTML]
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding,
|
||||
document_declared_encoding=None, exclude_encodings=None):
|
||||
# Store the user-specified encoding for use later on.
|
||||
self.user_specified_encoding = user_specified_encoding
|
||||
|
||||
# document_declared_encoding and exclude_encodings aren't used
|
||||
# ATM because the html5lib TreeBuilder doesn't use
|
||||
# UnicodeDammit.
|
||||
if exclude_encodings:
|
||||
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
|
||||
yield (markup, None, None, False)
|
||||
|
||||
# These methods are defined by Beautiful Soup.
|
||||
@@ -101,7 +114,13 @@ class AttrList(object):
|
||||
def __iter__(self):
|
||||
return list(self.attrs.items()).__iter__()
|
||||
def __setitem__(self, name, value):
|
||||
"set attr", name, value
|
||||
# If this attribute is a multi-valued attribute for this element,
|
||||
# turn its value into a list.
|
||||
list_attr = HTML5TreeBuilder.cdata_list_attributes
|
||||
if (name in list_attr['*']
|
||||
or (self.element.name in list_attr
|
||||
and name in list_attr[self.element.name])):
|
||||
value = whitespace_re.split(value)
|
||||
self.element[name] = value
|
||||
def items(self):
|
||||
return list(self.attrs.items())
|
||||
@@ -161,6 +180,12 @@ class Element(html5lib.treebuilders._base.Node):
|
||||
# immediately after the parent, if it has no children.)
|
||||
if self.element.contents:
|
||||
most_recent_element = self.element._last_descendant(False)
|
||||
elif self.element.next_element is not None:
|
||||
# Something from further ahead in the parse tree is
|
||||
# being inserted into this earlier element. This is
|
||||
# very annoying because it means an expensive search
|
||||
# for the last element in the tree.
|
||||
most_recent_element = self.soup._last_descendant()
|
||||
else:
|
||||
most_recent_element = self.element
|
||||
|
||||
@@ -172,6 +197,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||
return AttrList(self.element)
|
||||
|
||||
def setAttributes(self, attributes):
|
||||
|
||||
if attributes is not None and len(attributes) > 0:
|
||||
|
||||
converted_attributes = []
|
||||
@@ -218,6 +244,9 @@ class Element(html5lib.treebuilders._base.Node):
|
||||
|
||||
def reparentChildren(self, new_parent):
|
||||
"""Move all of this tag's children into another tag."""
|
||||
# print "MOVE", self.element.contents
|
||||
# print "FROM", self.element
|
||||
# print "TO", new_parent.element
|
||||
element = self.element
|
||||
new_parent_element = new_parent.element
|
||||
# Determine what this tag's next_element will be once all the children
|
||||
@@ -236,17 +265,28 @@ class Element(html5lib.treebuilders._base.Node):
|
||||
new_parents_last_descendant_next_element = new_parent_element.next_element
|
||||
|
||||
to_append = element.contents
|
||||
append_after = new_parent.element.contents
|
||||
append_after = new_parent_element.contents
|
||||
if len(to_append) > 0:
|
||||
# Set the first child's previous_element and previous_sibling
|
||||
# to elements within the new parent
|
||||
first_child = to_append[0]
|
||||
first_child.previous_element = new_parents_last_descendant
|
||||
if new_parents_last_descendant:
|
||||
first_child.previous_element = new_parents_last_descendant
|
||||
else:
|
||||
first_child.previous_element = new_parent_element
|
||||
first_child.previous_sibling = new_parents_last_child
|
||||
if new_parents_last_descendant:
|
||||
new_parents_last_descendant.next_element = first_child
|
||||
else:
|
||||
new_parent_element.next_element = first_child
|
||||
if new_parents_last_child:
|
||||
new_parents_last_child.next_sibling = first_child
|
||||
|
||||
# Fix the last child's next_element and next_sibling
|
||||
last_child = to_append[-1]
|
||||
last_child.next_element = new_parents_last_descendant_next_element
|
||||
if new_parents_last_descendant_next_element:
|
||||
new_parents_last_descendant_next_element.previous_element = last_child
|
||||
last_child.next_sibling = None
|
||||
|
||||
for child in to_append:
|
||||
@@ -257,6 +297,10 @@ class Element(html5lib.treebuilders._base.Node):
|
||||
element.contents = []
|
||||
element.next_element = final_next_element
|
||||
|
||||
# print "DONE WITH MOVE"
|
||||
# print "FROM", self.element
|
||||
# print "TO", new_parent_element
|
||||
|
||||
def cloneNode(self):
|
||||
tag = self.soup.new_tag(self.element.name, self.namespace)
|
||||
node = Element(tag, self.soup, self.namespace)
|
||||
@@ -268,7 +312,7 @@ class Element(html5lib.treebuilders._base.Node):
|
||||
return self.element.contents
|
||||
|
||||
def getNameTuple(self):
|
||||
if self.namespace is None:
|
||||
if self.namespace == None:
|
||||
return namespaces["html"], self.name
|
||||
else:
|
||||
return self.namespace, self.name
|
||||
|
||||
@@ -4,10 +4,16 @@ __all__ = [
|
||||
'HTMLParserTreeBuilder',
|
||||
]
|
||||
|
||||
from HTMLParser import (
|
||||
HTMLParser,
|
||||
HTMLParseError,
|
||||
)
|
||||
from HTMLParser import HTMLParser
|
||||
|
||||
try:
|
||||
from HTMLParser import HTMLParseError
|
||||
except ImportError, e:
|
||||
# HTMLParseError is removed in Python 3.5. Since it can never be
|
||||
# thrown in 3.5, we can just define our own class as a placeholder.
|
||||
class HTMLParseError(Exception):
|
||||
pass
|
||||
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
@@ -19,10 +25,10 @@ import warnings
|
||||
# At the end of this file, we monkeypatch HTMLParser so that
|
||||
# strict=True works well on Python 3.2.2.
|
||||
major, minor, release = sys.version_info[:3]
|
||||
CONSTRUCTOR_TAKES_STRICT = (
|
||||
major > 3
|
||||
or (major == 3 and minor > 2)
|
||||
or (major == 3 and minor == 2 and release >= 3))
|
||||
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
|
||||
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
|
||||
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
|
||||
|
||||
|
||||
from bs4.element import (
|
||||
CData,
|
||||
@@ -63,7 +69,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||
|
||||
def handle_charref(self, name):
|
||||
# XXX workaround for a bug in HTMLParser. Remove this once
|
||||
# it's fixed.
|
||||
# it's fixed in all supported versions.
|
||||
# http://bugs.python.org/issue13633
|
||||
if name.startswith('x'):
|
||||
real_name = int(name.lstrip('x'), 16)
|
||||
elif name.startswith('X'):
|
||||
@@ -113,14 +120,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||
|
||||
def handle_pi(self, data):
|
||||
self.soup.endData()
|
||||
if data.endswith("?") and data.lower().startswith("xml"):
|
||||
# "An XHTML processing instruction using the trailing '?'
|
||||
# will cause the '?' to be included in data." - HTMLParser
|
||||
# docs.
|
||||
#
|
||||
# Strip the question mark so we don't end up with two
|
||||
# question marks.
|
||||
data = data[:-1]
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(ProcessingInstruction)
|
||||
|
||||
@@ -128,15 +127,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||
|
||||
is_xml = False
|
||||
features = [HTML, STRICT, HTMLPARSER]
|
||||
picklable = True
|
||||
NAME = HTMLPARSER
|
||||
features = [NAME, HTML, STRICT]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
if CONSTRUCTOR_TAKES_STRICT:
|
||||
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
|
||||
kwargs['strict'] = False
|
||||
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
|
||||
kwargs['convert_charrefs'] = False
|
||||
self.parser_args = (args, kwargs)
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
document_declared_encoding=None):
|
||||
document_declared_encoding=None, exclude_encodings=None):
|
||||
"""
|
||||
:return: A 4-tuple (markup, original encoding, encoding
|
||||
declared within markup, whether any characters had to be
|
||||
@@ -147,7 +150,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||
return
|
||||
|
||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
|
||||
dammit = UnicodeDammit(markup, try_encodings, is_html=True,
|
||||
exclude_encodings=exclude_encodings)
|
||||
yield (dammit.markup, dammit.original_encoding,
|
||||
dammit.declared_html_encoding,
|
||||
dammit.contains_replacement_characters)
|
||||
|
||||
@@ -7,7 +7,12 @@ from io import BytesIO
|
||||
from StringIO import StringIO
|
||||
import collections
|
||||
from lxml import etree
|
||||
from bs4.element import Comment, Doctype, NamespacedAttribute
|
||||
from bs4.element import (
|
||||
Comment,
|
||||
Doctype,
|
||||
NamespacedAttribute,
|
||||
ProcessingInstruction,
|
||||
)
|
||||
from bs4.builder import (
|
||||
FAST,
|
||||
HTML,
|
||||
@@ -25,8 +30,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
|
||||
is_xml = True
|
||||
|
||||
NAME = "lxml-xml"
|
||||
ALTERNATE_NAMES = ["xml"]
|
||||
|
||||
# Well, it's permissive by XML parser standards.
|
||||
features = [LXML, XML, FAST, PERMISSIVE]
|
||||
features = [NAME, LXML, XML, FAST, PERMISSIVE]
|
||||
|
||||
CHUNK_SIZE = 512
|
||||
|
||||
@@ -70,6 +78,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
return (None, tag)
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
exclude_encodings=None,
|
||||
document_declared_encoding=None):
|
||||
"""
|
||||
:yield: A series of 4-tuples.
|
||||
@@ -95,7 +104,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
# the document as each one in turn.
|
||||
is_html = not self.is_xml
|
||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||
detector = EncodingDetector(markup, try_encodings, is_html)
|
||||
detector = EncodingDetector(
|
||||
markup, try_encodings, is_html, exclude_encodings)
|
||||
for encoding in detector.encodings:
|
||||
yield (detector.markup, encoding, document_declared_encoding, False)
|
||||
|
||||
@@ -189,7 +199,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
self.nsmaps.pop()
|
||||
|
||||
def pi(self, target, data):
|
||||
pass
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(target + ' ' + data)
|
||||
self.soup.endData(ProcessingInstruction)
|
||||
|
||||
def data(self, content):
|
||||
self.soup.handle_data(content)
|
||||
@@ -212,7 +224,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
|
||||
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||
|
||||
features = [LXML, HTML, FAST, PERMISSIVE]
|
||||
NAME = LXML
|
||||
ALTERNATE_NAMES = ["lxml-html"]
|
||||
|
||||
features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
|
||||
is_xml = False
|
||||
|
||||
def default_parser(self, encoding):
|
||||
|
||||
@@ -3,10 +3,11 @@
|
||||
|
||||
This library converts a bytestream to Unicode through any means
|
||||
necessary. It is heavily based on code from Mark Pilgrim's Universal
|
||||
Feed Parser. It works best on XML and XML, but it does not rewrite the
|
||||
Feed Parser. It works best on XML and HTML, but it does not rewrite the
|
||||
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
||||
"""
|
||||
|
||||
from pdb import set_trace
|
||||
import codecs
|
||||
from htmlentitydefs import codepoint2name
|
||||
import re
|
||||
@@ -212,8 +213,11 @@ class EncodingDetector:
|
||||
|
||||
5. Windows-1252.
|
||||
"""
|
||||
def __init__(self, markup, override_encodings=None, is_html=False):
|
||||
def __init__(self, markup, override_encodings=None, is_html=False,
|
||||
exclude_encodings=None):
|
||||
self.override_encodings = override_encodings or []
|
||||
exclude_encodings = exclude_encodings or []
|
||||
self.exclude_encodings = set([x.lower() for x in exclude_encodings])
|
||||
self.chardet_encoding = None
|
||||
self.is_html = is_html
|
||||
self.declared_encoding = None
|
||||
@@ -224,6 +228,8 @@ class EncodingDetector:
|
||||
def _usable(self, encoding, tried):
|
||||
if encoding is not None:
|
||||
encoding = encoding.lower()
|
||||
if encoding in self.exclude_encodings:
|
||||
return False
|
||||
if encoding not in tried:
|
||||
tried.add(encoding)
|
||||
return True
|
||||
@@ -266,6 +272,9 @@ class EncodingDetector:
|
||||
def strip_byte_order_mark(cls, data):
|
||||
"""If a byte-order mark is present, strip it and return the encoding it implies."""
|
||||
encoding = None
|
||||
if isinstance(data, unicode):
|
||||
# Unicode data cannot have a byte-order mark.
|
||||
return data, encoding
|
||||
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
|
||||
and (data[2:4] != '\x00\x00'):
|
||||
encoding = 'utf-16be'
|
||||
@@ -299,14 +308,14 @@ class EncodingDetector:
|
||||
else:
|
||||
xml_endpos = 1024
|
||||
html_endpos = max(2048, int(len(markup) * 0.05))
|
||||
|
||||
|
||||
declared_encoding = None
|
||||
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
|
||||
if not declared_encoding_match and is_html:
|
||||
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
|
||||
if declared_encoding_match is not None:
|
||||
declared_encoding = declared_encoding_match.groups()[0].decode(
|
||||
'ascii')
|
||||
'ascii', 'replace')
|
||||
if declared_encoding:
|
||||
return declared_encoding.lower()
|
||||
return None
|
||||
@@ -331,13 +340,14 @@ class UnicodeDammit:
|
||||
]
|
||||
|
||||
def __init__(self, markup, override_encodings=[],
|
||||
smart_quotes_to=None, is_html=False):
|
||||
smart_quotes_to=None, is_html=False, exclude_encodings=[]):
|
||||
self.smart_quotes_to = smart_quotes_to
|
||||
self.tried_encodings = []
|
||||
self.contains_replacement_characters = False
|
||||
self.is_html = is_html
|
||||
|
||||
self.detector = EncodingDetector(markup, override_encodings, is_html)
|
||||
self.detector = EncodingDetector(
|
||||
markup, override_encodings, is_html, exclude_encodings)
|
||||
|
||||
# Short-circuit if the data is in Unicode to begin with.
|
||||
if isinstance(markup, unicode) or markup == '':
|
||||
|
||||
@@ -33,12 +33,21 @@ def diagnose(data):
|
||||
|
||||
if 'lxml' in basic_parsers:
|
||||
basic_parsers.append(["lxml", "xml"])
|
||||
from lxml import etree
|
||||
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
|
||||
try:
|
||||
from lxml import etree
|
||||
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
|
||||
except ImportError, e:
|
||||
print (
|
||||
"lxml is not installed or couldn't be imported.")
|
||||
|
||||
|
||||
if 'html5lib' in basic_parsers:
|
||||
import html5lib
|
||||
print "Found html5lib version %s" % html5lib.__version__
|
||||
try:
|
||||
import html5lib
|
||||
print "Found html5lib version %s" % html5lib.__version__
|
||||
except ImportError, e:
|
||||
print (
|
||||
"html5lib is not installed or couldn't be imported.")
|
||||
|
||||
if hasattr(data, 'read'):
|
||||
data = data.read()
|
||||
@@ -135,7 +144,7 @@ def rword(length=5):
|
||||
def rsentence(length=4):
    """Generate a random sentence-like string.

    Joins `length` words with single spaces; each word comes from
    rword() with a length drawn from random.randint(4, 9).
    """
    words = []
    for _ in range(length):
        word_length = random.randint(4, 9)
        words.append(rword(word_length))
    return " ".join(words)
|
||||
|
||||
|
||||
def rdoc(num_elements=1000):
|
||||
"""Randomly generate an invalid HTML document."""
|
||||
tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
|
||||
@@ -159,7 +168,7 @@ def benchmark_parsers(num_elements=100000):
|
||||
print "Comparative parser benchmark on Beautiful Soup %s" % __version__
|
||||
data = rdoc(num_elements)
|
||||
print "Generated a large invalid HTML document (%d bytes)." % len(data)
|
||||
|
||||
|
||||
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
||||
success = False
|
||||
try:
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from pdb import set_trace
|
||||
import collections
|
||||
import re
|
||||
import sys
|
||||
@@ -185,24 +186,40 @@ class PageElement(object):
|
||||
return self.HTML_FORMATTERS.get(
|
||||
name, HTMLAwareEntitySubstitution.substitute_xml)
|
||||
|
||||
def setup(self, parent=None, previous_element=None):
|
||||
def setup(self, parent=None, previous_element=None, next_element=None,
|
||||
previous_sibling=None, next_sibling=None):
|
||||
"""Sets up the initial relations between this element and
|
||||
other elements."""
|
||||
self.parent = parent
|
||||
|
||||
self.previous_element = previous_element
|
||||
if previous_element is not None:
|
||||
self.previous_element.next_element = self
|
||||
self.next_element = None
|
||||
self.previous_sibling = None
|
||||
self.next_sibling = None
|
||||
if self.parent is not None and self.parent.contents:
|
||||
self.previous_sibling = self.parent.contents[-1]
|
||||
|
||||
self.next_element = next_element
|
||||
if self.next_element:
|
||||
self.next_element.previous_element = self
|
||||
|
||||
self.next_sibling = next_sibling
|
||||
if self.next_sibling:
|
||||
self.next_sibling.previous_sibling = self
|
||||
|
||||
if (not previous_sibling
|
||||
and self.parent is not None and self.parent.contents):
|
||||
previous_sibling = self.parent.contents[-1]
|
||||
|
||||
self.previous_sibling = previous_sibling
|
||||
if previous_sibling:
|
||||
self.previous_sibling.next_sibling = self
|
||||
|
||||
nextSibling = _alias("next_sibling") # BS3
|
||||
previousSibling = _alias("previous_sibling") # BS3
|
||||
|
||||
def replace_with(self, replace_with):
|
||||
if not self.parent:
|
||||
raise ValueError(
|
||||
"Cannot replace one element with another when the"
|
||||
"element to be replaced is not part of a tree.")
|
||||
if replace_with is self:
|
||||
return
|
||||
if replace_with is self.parent:
|
||||
@@ -216,6 +233,10 @@ class PageElement(object):
|
||||
|
||||
def unwrap(self):
|
||||
my_parent = self.parent
|
||||
if not self.parent:
|
||||
raise ValueError(
|
||||
"Cannot replace an element with its contents when that"
|
||||
"element is not part of a tree.")
|
||||
my_index = self.parent.index(self)
|
||||
self.extract()
|
||||
for child in reversed(self.contents[:]):
|
||||
@@ -240,17 +261,20 @@ class PageElement(object):
|
||||
last_child = self._last_descendant()
|
||||
next_element = last_child.next_element
|
||||
|
||||
if self.previous_element is not None:
|
||||
if (self.previous_element is not None and
|
||||
self.previous_element != next_element):
|
||||
self.previous_element.next_element = next_element
|
||||
if next_element is not None:
|
||||
if next_element is not None and next_element != self.previous_element:
|
||||
next_element.previous_element = self.previous_element
|
||||
self.previous_element = None
|
||||
last_child.next_element = None
|
||||
|
||||
self.parent = None
|
||||
if self.previous_sibling is not None:
|
||||
if (self.previous_sibling is not None
|
||||
and self.previous_sibling != self.next_sibling):
|
||||
self.previous_sibling.next_sibling = self.next_sibling
|
||||
if self.next_sibling is not None:
|
||||
if (self.next_sibling is not None
|
||||
and self.next_sibling != self.previous_sibling):
|
||||
self.next_sibling.previous_sibling = self.previous_sibling
|
||||
self.previous_sibling = self.next_sibling = None
|
||||
return self
|
||||
@@ -478,6 +502,10 @@ class PageElement(object):
|
||||
def _find_all(self, name, attrs, text, limit, generator, **kwargs):
|
||||
"Iterates over a generator looking for things that match."
|
||||
|
||||
if text is None and 'string' in kwargs:
|
||||
text = kwargs['string']
|
||||
del kwargs['string']
|
||||
|
||||
if isinstance(name, SoupStrainer):
|
||||
strainer = name
|
||||
else:
|
||||
@@ -548,17 +576,17 @@ class PageElement(object):
|
||||
|
||||
# Methods for supporting CSS selectors.
|
||||
|
||||
tag_name_re = re.compile('^[a-z0-9]+$')
|
||||
tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')
|
||||
|
||||
# /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
|
||||
# \---/ \---/\-------------/ \-------/
|
||||
# | | | |
|
||||
# | | | The value
|
||||
# | | ~,|,^,$,* or =
|
||||
# | Attribute
|
||||
# /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
|
||||
# \---------------------------/ \---/\-------------/ \-------/
|
||||
# | | | |
|
||||
# | | | The value
|
||||
# | | ~,|,^,$,* or =
|
||||
# | Attribute
|
||||
# Tag
|
||||
attribselect_re = re.compile(
|
||||
r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
|
||||
r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
|
||||
r'=?"?(?P<value>[^\]"]*)"?\]$'
|
||||
)
|
||||
|
||||
@@ -654,11 +682,17 @@ class NavigableString(unicode, PageElement):
|
||||
how to handle non-ASCII characters.
|
||||
"""
|
||||
if isinstance(value, unicode):
|
||||
return unicode.__new__(cls, value)
|
||||
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
|
||||
u = unicode.__new__(cls, value)
|
||||
else:
|
||||
u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
|
||||
u.setup()
|
||||
return u
|
||||
|
||||
def __copy__(self):
|
||||
return self
|
||||
"""A copy of a NavigableString has the same contents and class
|
||||
as the original, but it is not connected to the parse tree.
|
||||
"""
|
||||
return type(self)(self)
|
||||
|
||||
def __getnewargs__(self):
|
||||
return (unicode(self),)
|
||||
@@ -707,7 +741,7 @@ class CData(PreformattedString):
|
||||
class ProcessingInstruction(PreformattedString):
|
||||
|
||||
PREFIX = u'<?'
|
||||
SUFFIX = u'?>'
|
||||
SUFFIX = u'>'
|
||||
|
||||
class Comment(PreformattedString):
|
||||
|
||||
@@ -759,9 +793,12 @@ class Tag(PageElement):
|
||||
self.prefix = prefix
|
||||
if attrs is None:
|
||||
attrs = {}
|
||||
elif attrs and builder.cdata_list_attributes:
|
||||
attrs = builder._replace_cdata_list_attribute_values(
|
||||
self.name, attrs)
|
||||
elif attrs:
|
||||
if builder is not None and builder.cdata_list_attributes:
|
||||
attrs = builder._replace_cdata_list_attribute_values(
|
||||
self.name, attrs)
|
||||
else:
|
||||
attrs = dict(attrs)
|
||||
else:
|
||||
attrs = dict(attrs)
|
||||
self.attrs = attrs
|
||||
@@ -778,6 +815,18 @@ class Tag(PageElement):
|
||||
|
||||
parserClass = _alias("parser_class") # BS3
|
||||
|
||||
def __copy__(self):
    """A copy of a Tag is a new Tag, unconnected to the parse tree.

    Its contents are a copy of the old Tag's contents.

    :return: A new object of the same type as `self`, constructed from
        this tag's builder, name, namespace, prefix and attributes.
    """
    # The constructor stores the namespace prefix as `self.prefix`, so
    # that is the attribute to hand to the clone.
    clone = type(self)(None, self.builder, self.name, self.namespace,
                       self.prefix, self.attrs)
    # These flags are not passed through the constructor call above, so
    # mirror them on the clone by hand.
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone, attr, getattr(self, attr))
    # Copy each child and attach the copy to the clone.
    for child in self.contents:
        clone.append(child.__copy__())
    return clone
|
||||
|
||||
@property
|
||||
def is_empty_element(self):
|
||||
"""Is this tag an empty-element tag? (aka a self-closing tag)
|
||||
@@ -971,15 +1020,25 @@ class Tag(PageElement):
|
||||
as defined in __eq__."""
|
||||
return not self == other
|
||||
|
||||
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
|
||||
def __repr__(self, encoding="unicode-escape"):
|
||||
"""Renders this tag as a string."""
|
||||
return self.encode(encoding)
|
||||
if PY3K:
|
||||
# "The return value must be a string object", i.e. Unicode
|
||||
return self.decode()
|
||||
else:
|
||||
# "The return value must be a string object", i.e. a bytestring.
|
||||
# By convention, the return value of __repr__ should also be
|
||||
# an ASCII string.
|
||||
return self.encode(encoding)
|
||||
|
||||
def __unicode__(self):
|
||||
return self.decode()
|
||||
|
||||
def __str__(self):
|
||||
return self.encode()
|
||||
if PY3K:
|
||||
return self.decode()
|
||||
else:
|
||||
return self.encode()
|
||||
|
||||
if PY3K:
|
||||
__str__ = __repr__ = __unicode__
|
||||
@@ -1103,12 +1162,18 @@ class Tag(PageElement):
|
||||
formatter="minimal"):
|
||||
"""Renders the contents of this tag as a Unicode string.
|
||||
|
||||
:param indent_level: Each line of the rendering will be
|
||||
indented this many spaces.
|
||||
|
||||
:param eventual_encoding: The tag is destined to be
|
||||
encoded into this encoding. This method is _not_
|
||||
responsible for performing that encoding. This information
|
||||
is passed in so that it can be substituted in if the
|
||||
document contains a <META> tag that mentions the document's
|
||||
encoding.
|
||||
|
||||
:param formatter: The output formatter responsible for converting
|
||||
entities to Unicode characters.
|
||||
"""
|
||||
# First off, turn a string formatter into a function. This
|
||||
# will stop the lookup from happening over and over again.
|
||||
@@ -1137,7 +1202,17 @@ class Tag(PageElement):
|
||||
def encode_contents(
|
||||
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
|
||||
formatter="minimal"):
|
||||
"""Renders the contents of this tag as a bytestring."""
|
||||
"""Renders the contents of this tag as a bytestring.
|
||||
|
||||
:param indent_level: Each line of the rendering will be
|
||||
indented this many spaces.
|
||||
|
||||
:param encoding: The bytestring will be in this encoding.
|
||||
|
||||
:param formatter: The output formatter responsible for converting
|
||||
entities to Unicode characters.
|
||||
"""
|
||||
|
||||
contents = self.decode_contents(indent_level, encoding, formatter)
|
||||
return contents.encode(encoding)
|
||||
|
||||
@@ -1201,63 +1276,89 @@ class Tag(PageElement):
|
||||
|
||||
_selector_combinators = ['>', '+', '~']
|
||||
_select_debug = False
|
||||
def select(self, selector, _candidate_generator=None):
|
||||
def select_one(self, selector):
|
||||
"""Perform a CSS selection operation on the current element."""
|
||||
tokens = selector.split()
|
||||
value = self.select(selector, limit=1)
|
||||
if value:
|
||||
return value[0]
|
||||
return None
|
||||
|
||||
def select(self, selector, _candidate_generator=None, limit=None):
|
||||
"""Perform a CSS selection operation on the current element."""
|
||||
|
||||
# Remove whitespace directly after the grouping operator ','
|
||||
# then split into tokens.
|
||||
tokens = re.sub(',[\s]*',',', selector).split()
|
||||
current_context = [self]
|
||||
|
||||
if tokens[-1] in self._selector_combinators:
|
||||
raise ValueError(
|
||||
'Final combinator "%s" is missing an argument.' % tokens[-1])
|
||||
|
||||
if self._select_debug:
|
||||
print 'Running CSS selector "%s"' % selector
|
||||
for index, token in enumerate(tokens):
|
||||
if self._select_debug:
|
||||
print ' Considering token "%s"' % token
|
||||
recursive_candidate_generator = None
|
||||
tag_name = None
|
||||
|
||||
for index, token_group in enumerate(tokens):
|
||||
new_context = []
|
||||
new_context_ids = set([])
|
||||
|
||||
# Grouping selectors, ie: p,a
|
||||
grouped_tokens = token_group.split(',')
|
||||
if '' in grouped_tokens:
|
||||
raise ValueError('Invalid group selection syntax: %s' % token_group)
|
||||
|
||||
if tokens[index-1] in self._selector_combinators:
|
||||
# This token was consumed by the previous combinator. Skip it.
|
||||
if self._select_debug:
|
||||
print ' Token was consumed by the previous combinator.'
|
||||
continue
|
||||
# Each operation corresponds to a checker function, a rule
|
||||
# for determining whether a candidate matches the
|
||||
# selector. Candidates are generated by the active
|
||||
# iterator.
|
||||
checker = None
|
||||
|
||||
m = self.attribselect_re.match(token)
|
||||
if m is not None:
|
||||
# Attribute selector
|
||||
tag_name, attribute, operator, value = m.groups()
|
||||
checker = self._attribute_checker(operator, attribute, value)
|
||||
for token in grouped_tokens:
|
||||
if self._select_debug:
|
||||
print ' Considering token "%s"' % token
|
||||
recursive_candidate_generator = None
|
||||
tag_name = None
|
||||
|
||||
elif '#' in token:
|
||||
# ID selector
|
||||
tag_name, tag_id = token.split('#', 1)
|
||||
def id_matches(tag):
|
||||
return tag.get('id', None) == tag_id
|
||||
checker = id_matches
|
||||
# Each operation corresponds to a checker function, a rule
|
||||
# for determining whether a candidate matches the
|
||||
# selector. Candidates are generated by the active
|
||||
# iterator.
|
||||
checker = None
|
||||
|
||||
elif '.' in token:
|
||||
# Class selector
|
||||
tag_name, klass = token.split('.', 1)
|
||||
classes = set(klass.split('.'))
|
||||
def classes_match(candidate):
|
||||
return classes.issubset(candidate.get('class', []))
|
||||
checker = classes_match
|
||||
m = self.attribselect_re.match(token)
|
||||
if m is not None:
|
||||
# Attribute selector
|
||||
tag_name, attribute, operator, value = m.groups()
|
||||
checker = self._attribute_checker(operator, attribute, value)
|
||||
|
||||
elif ':' in token:
|
||||
# Pseudo-class
|
||||
tag_name, pseudo = token.split(':', 1)
|
||||
if tag_name == '':
|
||||
raise ValueError(
|
||||
"A pseudo-class must be prefixed with a tag name.")
|
||||
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
|
||||
found = []
|
||||
if pseudo_attributes is not None:
|
||||
pseudo_type, pseudo_value = pseudo_attributes.groups()
|
||||
elif '#' in token:
|
||||
# ID selector
|
||||
tag_name, tag_id = token.split('#', 1)
|
||||
def id_matches(tag):
|
||||
return tag.get('id', None) == tag_id
|
||||
checker = id_matches
|
||||
|
||||
elif '.' in token:
|
||||
# Class selector
|
||||
tag_name, klass = token.split('.', 1)
|
||||
classes = set(klass.split('.'))
|
||||
def classes_match(candidate):
|
||||
return classes.issubset(candidate.get('class', []))
|
||||
checker = classes_match
|
||||
|
||||
elif ':' in token:
|
||||
# Pseudo-class
|
||||
tag_name, pseudo = token.split(':', 1)
|
||||
if tag_name == '':
|
||||
raise ValueError(
|
||||
"A pseudo-class must be prefixed with a tag name.")
|
||||
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
|
||||
found = []
|
||||
if pseudo_attributes is None:
|
||||
pseudo_type = pseudo
|
||||
pseudo_value = None
|
||||
else:
|
||||
pseudo_type, pseudo_value = pseudo_attributes.groups()
|
||||
if pseudo_type == 'nth-of-type':
|
||||
try:
|
||||
pseudo_value = int(pseudo_value)
|
||||
@@ -1286,109 +1387,110 @@ class Tag(PageElement):
|
||||
raise NotImplementedError(
|
||||
'Only the following pseudo-classes are implemented: nth-of-type.')
|
||||
|
||||
elif token == '*':
|
||||
# Star selector -- matches everything
|
||||
pass
|
||||
elif token == '>':
|
||||
# Run the next token as a CSS selector against the
|
||||
# direct children of each tag in the current context.
|
||||
recursive_candidate_generator = lambda tag: tag.children
|
||||
elif token == '~':
|
||||
# Run the next token as a CSS selector against the
|
||||
# siblings of each tag in the current context.
|
||||
recursive_candidate_generator = lambda tag: tag.next_siblings
|
||||
elif token == '+':
|
||||
# For each tag in the current context, run the next
|
||||
# token as a CSS selector against the tag's next
|
||||
# sibling that's a tag.
|
||||
def next_tag_sibling(tag):
|
||||
yield tag.find_next_sibling(True)
|
||||
recursive_candidate_generator = next_tag_sibling
|
||||
elif token == '*':
|
||||
# Star selector -- matches everything
|
||||
pass
|
||||
elif token == '>':
|
||||
# Run the next token as a CSS selector against the
|
||||
# direct children of each tag in the current context.
|
||||
recursive_candidate_generator = lambda tag: tag.children
|
||||
elif token == '~':
|
||||
# Run the next token as a CSS selector against the
|
||||
# siblings of each tag in the current context.
|
||||
recursive_candidate_generator = lambda tag: tag.next_siblings
|
||||
elif token == '+':
|
||||
# For each tag in the current context, run the next
|
||||
# token as a CSS selector against the tag's next
|
||||
# sibling that's a tag.
|
||||
def next_tag_sibling(tag):
|
||||
yield tag.find_next_sibling(True)
|
||||
recursive_candidate_generator = next_tag_sibling
|
||||
|
||||
elif self.tag_name_re.match(token):
|
||||
# Just a tag name.
|
||||
tag_name = token
|
||||
else:
|
||||
raise ValueError(
|
||||
'Unsupported or invalid CSS selector: "%s"' % token)
|
||||
|
||||
if recursive_candidate_generator:
|
||||
# This happens when the selector looks like "> foo".
|
||||
#
|
||||
# The generator calls select() recursively on every
|
||||
# member of the current context, passing in a different
|
||||
# candidate generator and a different selector.
|
||||
#
|
||||
# In the case of "> foo", the candidate generator is
|
||||
# one that yields a tag's direct children (">"), and
|
||||
# the selector is "foo".
|
||||
next_token = tokens[index+1]
|
||||
def recursive_select(tag):
|
||||
if self._select_debug:
|
||||
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
|
||||
print '-' * 40
|
||||
for i in tag.select(next_token, recursive_candidate_generator):
|
||||
if self._select_debug:
|
||||
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
|
||||
yield i
|
||||
if self._select_debug:
|
||||
print '-' * 40
|
||||
_use_candidate_generator = recursive_select
|
||||
elif _candidate_generator is None:
|
||||
# By default, a tag's candidates are all of its
|
||||
# descendants. If tag_name is defined, only yield tags
|
||||
# with that name.
|
||||
if self._select_debug:
|
||||
if tag_name:
|
||||
check = "[any]"
|
||||
else:
|
||||
check = tag_name
|
||||
print ' Default candidate generator, tag name="%s"' % check
|
||||
if self._select_debug:
|
||||
# This is redundant with later code, but it stops
|
||||
# a bunch of bogus tags from cluttering up the
|
||||
# debug log.
|
||||
def default_candidate_generator(tag):
|
||||
for child in tag.descendants:
|
||||
if not isinstance(child, Tag):
|
||||
continue
|
||||
if tag_name and not child.name == tag_name:
|
||||
continue
|
||||
yield child
|
||||
_use_candidate_generator = default_candidate_generator
|
||||
elif self.tag_name_re.match(token):
|
||||
# Just a tag name.
|
||||
tag_name = token
|
||||
else:
|
||||
_use_candidate_generator = lambda tag: tag.descendants
|
||||
else:
|
||||
_use_candidate_generator = _candidate_generator
|
||||
|
||||
new_context = []
|
||||
new_context_ids = set([])
|
||||
for tag in current_context:
|
||||
if self._select_debug:
|
||||
print " Running candidate generator on %s %s" % (
|
||||
tag.name, repr(tag.attrs))
|
||||
for candidate in _use_candidate_generator(tag):
|
||||
if not isinstance(candidate, Tag):
|
||||
continue
|
||||
if tag_name and candidate.name != tag_name:
|
||||
continue
|
||||
if checker is not None:
|
||||
try:
|
||||
result = checker(candidate)
|
||||
except StopIteration:
|
||||
# The checker has decided we should no longer
|
||||
# run the generator.
|
||||
break
|
||||
if checker is None or result:
|
||||
raise ValueError(
|
||||
'Unsupported or invalid CSS selector: "%s"' % token)
|
||||
if recursive_candidate_generator:
|
||||
# This happens when the selector looks like "> foo".
|
||||
#
|
||||
# The generator calls select() recursively on every
|
||||
# member of the current context, passing in a different
|
||||
# candidate generator and a different selector.
|
||||
#
|
||||
# In the case of "> foo", the candidate generator is
|
||||
# one that yields a tag's direct children (">"), and
|
||||
# the selector is "foo".
|
||||
next_token = tokens[index+1]
|
||||
def recursive_select(tag):
|
||||
if self._select_debug:
|
||||
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
|
||||
if id(candidate) not in new_context_ids:
|
||||
# If a tag matches a selector more than once,
|
||||
# don't include it in the context more than once.
|
||||
new_context.append(candidate)
|
||||
new_context_ids.add(id(candidate))
|
||||
elif self._select_debug:
|
||||
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
|
||||
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
|
||||
print '-' * 40
|
||||
for i in tag.select(next_token, recursive_candidate_generator):
|
||||
if self._select_debug:
|
||||
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
|
||||
yield i
|
||||
if self._select_debug:
|
||||
print '-' * 40
|
||||
_use_candidate_generator = recursive_select
|
||||
elif _candidate_generator is None:
|
||||
# By default, a tag's candidates are all of its
|
||||
# descendants. If tag_name is defined, only yield tags
|
||||
# with that name.
|
||||
if self._select_debug:
|
||||
if tag_name:
|
||||
check = "[any]"
|
||||
else:
|
||||
check = tag_name
|
||||
print ' Default candidate generator, tag name="%s"' % check
|
||||
if self._select_debug:
|
||||
# This is redundant with later code, but it stops
|
||||
# a bunch of bogus tags from cluttering up the
|
||||
# debug log.
|
||||
def default_candidate_generator(tag):
|
||||
for child in tag.descendants:
|
||||
if not isinstance(child, Tag):
|
||||
continue
|
||||
if tag_name and not child.name == tag_name:
|
||||
continue
|
||||
yield child
|
||||
_use_candidate_generator = default_candidate_generator
|
||||
else:
|
||||
_use_candidate_generator = lambda tag: tag.descendants
|
||||
else:
|
||||
_use_candidate_generator = _candidate_generator
|
||||
|
||||
count = 0
|
||||
for tag in current_context:
|
||||
if self._select_debug:
|
||||
print " Running candidate generator on %s %s" % (
|
||||
tag.name, repr(tag.attrs))
|
||||
for candidate in _use_candidate_generator(tag):
|
||||
if not isinstance(candidate, Tag):
|
||||
continue
|
||||
if tag_name and candidate.name != tag_name:
|
||||
continue
|
||||
if checker is not None:
|
||||
try:
|
||||
result = checker(candidate)
|
||||
except StopIteration:
|
||||
# The checker has decided we should no longer
|
||||
# run the generator.
|
||||
break
|
||||
if checker is None or result:
|
||||
if self._select_debug:
|
||||
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
|
||||
if id(candidate) not in new_context_ids:
|
||||
# If a tag matches a selector more than once,
|
||||
# don't include it in the context more than once.
|
||||
new_context.append(candidate)
|
||||
new_context_ids.add(id(candidate))
|
||||
if limit and len(new_context) >= limit:
|
||||
break
|
||||
elif self._select_debug:
|
||||
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
|
||||
|
||||
|
||||
current_context = new_context
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
"""Helper classes for tests."""
|
||||
|
||||
import pickle
|
||||
import copy
|
||||
import functools
|
||||
import unittest
|
||||
@@ -43,6 +44,16 @@ class SoupTest(unittest.TestCase):
|
||||
|
||||
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
|
||||
|
||||
def assertConnectedness(self, element):
|
||||
"""Ensure that next_element and previous_element are properly
|
||||
set for all descendants of the given element.
|
||||
"""
|
||||
earlier = None
|
||||
for e in element.descendants:
|
||||
if earlier:
|
||||
self.assertEqual(e, earlier.next_element)
|
||||
self.assertEqual(earlier, e.previous_element)
|
||||
earlier = e
|
||||
|
||||
class HTMLTreeBuilderSmokeTest(object):
|
||||
|
||||
@@ -54,6 +65,15 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||
markup in these tests, there's not much room for interpretation.
|
||||
"""
|
||||
|
||||
def test_pickle_and_unpickle_identity(self):
|
||||
# Pickling a tree, then unpickling it, yields a tree identical
|
||||
# to the original.
|
||||
tree = self.soup("<a><b>foo</a>")
|
||||
dumped = pickle.dumps(tree, 2)
|
||||
loaded = pickle.loads(dumped)
|
||||
self.assertEqual(loaded.__class__, BeautifulSoup)
|
||||
self.assertEqual(loaded.decode(), tree.decode())
|
||||
|
||||
def assertDoctypeHandled(self, doctype_fragment):
|
||||
"""Assert that a given doctype string is handled correctly."""
|
||||
doctype_str, soup = self._document_with_doctype(doctype_fragment)
|
||||
@@ -114,6 +134,11 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||
soup.encode("utf-8").replace(b"\n", b""),
|
||||
markup.replace(b"\n", b""))
|
||||
|
||||
def test_processing_instruction(self):
|
||||
markup = b"""<?PITarget PIContent?>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(markup, soup.encode("utf8"))
|
||||
|
||||
def test_deepcopy(self):
|
||||
"""Make sure you can copy the tree builder.
|
||||
|
||||
@@ -155,6 +180,23 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||
def test_nested_formatting_elements(self):
|
||||
self.assertSoupEquals("<em><em></em></em>")
|
||||
|
||||
def test_double_head(self):
|
||||
html = '''<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Ordinary HEAD element test</title>
|
||||
</head>
|
||||
<script type="text/javascript">
|
||||
alert("Help!");
|
||||
</script>
|
||||
<body>
|
||||
Hello, world!
|
||||
</body>
|
||||
</html>
|
||||
'''
|
||||
soup = self.soup(html)
|
||||
self.assertEqual("text/javascript", soup.find('script')['type'])
|
||||
|
||||
def test_comment(self):
|
||||
# Comments are represented as Comment objects.
|
||||
markup = "<p>foo<!--foobar-->baz</p>"
|
||||
@@ -221,6 +263,14 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(["css"], soup.div.div['class'])
|
||||
|
||||
def test_multivalued_attribute_on_html(self):
|
||||
# html5lib uses a different API to set the attributes of the
|
||||
# <html> tag. This has caused problems with multivalued
|
||||
# attributes.
|
||||
markup = '<html class="a b"></html>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(["a", "b"], soup.html['class'])
|
||||
|
||||
def test_angle_brackets_in_attribute_values_are_escaped(self):
|
||||
self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>')
|
||||
|
||||
@@ -253,6 +303,35 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
|
||||
self.assertEqual("p", soup.h2.string.next_element.name)
|
||||
self.assertEqual("p", soup.p.name)
|
||||
self.assertConnectedness(soup)
|
||||
|
||||
def test_head_tag_between_head_and_body(self):
|
||||
"Prevent recurrence of a bug in the html5lib treebuilder."
|
||||
content = """<html><head></head>
|
||||
<link></link>
|
||||
<body>foo</body>
|
||||
</html>
|
||||
"""
|
||||
soup = self.soup(content)
|
||||
self.assertNotEqual(None, soup.html.body)
|
||||
self.assertConnectedness(soup)
|
||||
|
||||
def test_multiple_copies_of_a_tag(self):
|
||||
"Prevent recurrence of a bug in the html5lib treebuilder."
|
||||
content = """<!DOCTYPE html>
|
||||
<html>
|
||||
<body>
|
||||
<article id="a" >
|
||||
<div><a href="1"></div>
|
||||
<footer>
|
||||
<a href="2"></a>
|
||||
</footer>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
soup = self.soup(content)
|
||||
self.assertConnectedness(soup.article)
|
||||
|
||||
def test_basic_namespaces(self):
|
||||
"""Parsers don't need to *understand* namespaces, but at the
|
||||
@@ -463,6 +542,15 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||
|
||||
class XMLTreeBuilderSmokeTest(object):
|
||||
|
||||
def test_pickle_and_unpickle_identity(self):
|
||||
# Pickling a tree, then unpickling it, yields a tree identical
|
||||
# to the original.
|
||||
tree = self.soup("<a><b>foo</a>")
|
||||
dumped = pickle.dumps(tree, 2)
|
||||
loaded = pickle.loads(dumped)
|
||||
self.assertEqual(loaded.__class__, BeautifulSoup)
|
||||
self.assertEqual(loaded.decode(), tree.decode())
|
||||
|
||||
def test_docstring_generated(self):
|
||||
soup = self.soup("<root/>")
|
||||
self.assertEqual(
|
||||
@@ -485,7 +573,7 @@ class XMLTreeBuilderSmokeTest(object):
|
||||
<script type="text/javascript">
|
||||
</script>
|
||||
"""
|
||||
soup = BeautifulSoup(doc, "xml")
|
||||
soup = BeautifulSoup(doc, "lxml-xml")
|
||||
# lxml would have stripped this while parsing, but we can add
|
||||
# it later.
|
||||
soup.script.string = 'console.log("< < hey > > ");'
|
||||
|
||||
@@ -20,4 +20,6 @@ from .serializer import serialize
|
||||
|
||||
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
|
||||
"getTreeWalker", "serialize"]
|
||||
__version__ = "0.999"
|
||||
|
||||
# this has to be at the top level, see how setup.py parses this
|
||||
__version__ = "0.999999"
|
||||
|
||||
@@ -1,292 +1,290 @@
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import string
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
EOF = None
|
||||
|
||||
E = {
|
||||
"null-character":
|
||||
_("Null character in input stream, replaced with U+FFFD."),
|
||||
"Null character in input stream, replaced with U+FFFD.",
|
||||
"invalid-codepoint":
|
||||
_("Invalid codepoint in stream."),
|
||||
"Invalid codepoint in stream.",
|
||||
"incorrectly-placed-solidus":
|
||||
_("Solidus (/) incorrectly placed in tag."),
|
||||
"Solidus (/) incorrectly placed in tag.",
|
||||
"incorrect-cr-newline-entity":
|
||||
_("Incorrect CR newline entity, replaced with LF."),
|
||||
"Incorrect CR newline entity, replaced with LF.",
|
||||
"illegal-windows-1252-entity":
|
||||
_("Entity used with illegal number (windows-1252 reference)."),
|
||||
"Entity used with illegal number (windows-1252 reference).",
|
||||
"cant-convert-numeric-entity":
|
||||
_("Numeric entity couldn't be converted to character "
|
||||
"(codepoint U+%(charAsInt)08x)."),
|
||||
"Numeric entity couldn't be converted to character "
|
||||
"(codepoint U+%(charAsInt)08x).",
|
||||
"illegal-codepoint-for-numeric-entity":
|
||||
_("Numeric entity represents an illegal codepoint: "
|
||||
"U+%(charAsInt)08x."),
|
||||
"Numeric entity represents an illegal codepoint: "
|
||||
"U+%(charAsInt)08x.",
|
||||
"numeric-entity-without-semicolon":
|
||||
_("Numeric entity didn't end with ';'."),
|
||||
"Numeric entity didn't end with ';'.",
|
||||
"expected-numeric-entity-but-got-eof":
|
||||
_("Numeric entity expected. Got end of file instead."),
|
||||
"Numeric entity expected. Got end of file instead.",
|
||||
"expected-numeric-entity":
|
||||
_("Numeric entity expected but none found."),
|
||||
"Numeric entity expected but none found.",
|
||||
"named-entity-without-semicolon":
|
||||
_("Named entity didn't end with ';'."),
|
||||
"Named entity didn't end with ';'.",
|
||||
"expected-named-entity":
|
||||
_("Named entity expected. Got none."),
|
||||
"Named entity expected. Got none.",
|
||||
"attributes-in-end-tag":
|
||||
_("End tag contains unexpected attributes."),
|
||||
"End tag contains unexpected attributes.",
|
||||
'self-closing-flag-on-end-tag':
|
||||
_("End tag contains unexpected self-closing flag."),
|
||||
"End tag contains unexpected self-closing flag.",
|
||||
"expected-tag-name-but-got-right-bracket":
|
||||
_("Expected tag name. Got '>' instead."),
|
||||
"Expected tag name. Got '>' instead.",
|
||||
"expected-tag-name-but-got-question-mark":
|
||||
_("Expected tag name. Got '?' instead. (HTML doesn't "
|
||||
"support processing instructions.)"),
|
||||
"Expected tag name. Got '?' instead. (HTML doesn't "
|
||||
"support processing instructions.)",
|
||||
"expected-tag-name":
|
||||
_("Expected tag name. Got something else instead"),
|
||||
"Expected tag name. Got something else instead",
|
||||
"expected-closing-tag-but-got-right-bracket":
|
||||
_("Expected closing tag. Got '>' instead. Ignoring '</>'."),
|
||||
"Expected closing tag. Got '>' instead. Ignoring '</>'.",
|
||||
"expected-closing-tag-but-got-eof":
|
||||
_("Expected closing tag. Unexpected end of file."),
|
||||
"Expected closing tag. Unexpected end of file.",
|
||||
"expected-closing-tag-but-got-char":
|
||||
_("Expected closing tag. Unexpected character '%(data)s' found."),
|
||||
"Expected closing tag. Unexpected character '%(data)s' found.",
|
||||
"eof-in-tag-name":
|
||||
_("Unexpected end of file in the tag name."),
|
||||
"Unexpected end of file in the tag name.",
|
||||
"expected-attribute-name-but-got-eof":
|
||||
_("Unexpected end of file. Expected attribute name instead."),
|
||||
"Unexpected end of file. Expected attribute name instead.",
|
||||
"eof-in-attribute-name":
|
||||
_("Unexpected end of file in attribute name."),
|
||||
"Unexpected end of file in attribute name.",
|
||||
"invalid-character-in-attribute-name":
|
||||
_("Invalid character in attribute name"),
|
||||
"Invalid character in attribute name",
|
||||
"duplicate-attribute":
|
||||
_("Dropped duplicate attribute on tag."),
|
||||
"Dropped duplicate attribute on tag.",
|
||||
"expected-end-of-tag-name-but-got-eof":
|
||||
_("Unexpected end of file. Expected = or end of tag."),
|
||||
"Unexpected end of file. Expected = or end of tag.",
|
||||
"expected-attribute-value-but-got-eof":
|
||||
_("Unexpected end of file. Expected attribute value."),
|
||||
"Unexpected end of file. Expected attribute value.",
|
||||
"expected-attribute-value-but-got-right-bracket":
|
||||
_("Expected attribute value. Got '>' instead."),
|
||||
"Expected attribute value. Got '>' instead.",
|
||||
'equals-in-unquoted-attribute-value':
|
||||
_("Unexpected = in unquoted attribute"),
|
||||
"Unexpected = in unquoted attribute",
|
||||
'unexpected-character-in-unquoted-attribute-value':
|
||||
_("Unexpected character in unquoted attribute"),
|
||||
"Unexpected character in unquoted attribute",
|
||||
"invalid-character-after-attribute-name":
|
||||
_("Unexpected character after attribute name."),
|
||||
"Unexpected character after attribute name.",
|
||||
"unexpected-character-after-attribute-value":
|
||||
_("Unexpected character after attribute value."),
|
||||
"Unexpected character after attribute value.",
|
||||
"eof-in-attribute-value-double-quote":
|
||||
_("Unexpected end of file in attribute value (\")."),
|
||||
"Unexpected end of file in attribute value (\").",
|
||||
"eof-in-attribute-value-single-quote":
|
||||
_("Unexpected end of file in attribute value (')."),
|
||||
"Unexpected end of file in attribute value (').",
|
||||
"eof-in-attribute-value-no-quotes":
|
||||
_("Unexpected end of file in attribute value."),
|
||||
"Unexpected end of file in attribute value.",
|
||||
"unexpected-EOF-after-solidus-in-tag":
|
||||
_("Unexpected end of file in tag. Expected >"),
|
||||
"Unexpected end of file in tag. Expected >",
|
||||
"unexpected-character-after-solidus-in-tag":
|
||||
_("Unexpected character after / in tag. Expected >"),
|
||||
"Unexpected character after / in tag. Expected >",
|
||||
"expected-dashes-or-doctype":
|
||||
_("Expected '--' or 'DOCTYPE'. Not found."),
|
||||
"Expected '--' or 'DOCTYPE'. Not found.",
|
||||
"unexpected-bang-after-double-dash-in-comment":
|
||||
_("Unexpected ! after -- in comment"),
|
||||
"Unexpected ! after -- in comment",
|
||||
"unexpected-space-after-double-dash-in-comment":
|
||||
_("Unexpected space after -- in comment"),
|
||||
"Unexpected space after -- in comment",
|
||||
"incorrect-comment":
|
||||
_("Incorrect comment."),
|
||||
"Incorrect comment.",
|
||||
"eof-in-comment":
|
||||
_("Unexpected end of file in comment."),
|
||||
"Unexpected end of file in comment.",
|
||||
"eof-in-comment-end-dash":
|
||||
_("Unexpected end of file in comment (-)"),
|
||||
"Unexpected end of file in comment (-)",
|
||||
"unexpected-dash-after-double-dash-in-comment":
|
||||
_("Unexpected '-' after '--' found in comment."),
|
||||
"Unexpected '-' after '--' found in comment.",
|
||||
"eof-in-comment-double-dash":
|
||||
_("Unexpected end of file in comment (--)."),
|
||||
"Unexpected end of file in comment (--).",
|
||||
"eof-in-comment-end-space-state":
|
||||
_("Unexpected end of file in comment."),
|
||||
"Unexpected end of file in comment.",
|
||||
"eof-in-comment-end-bang-state":
|
||||
_("Unexpected end of file in comment."),
|
||||
"Unexpected end of file in comment.",
|
||||
"unexpected-char-in-comment":
|
||||
_("Unexpected character in comment found."),
|
||||
"Unexpected character in comment found.",
|
||||
"need-space-after-doctype":
|
||||
_("No space after literal string 'DOCTYPE'."),
|
||||
"No space after literal string 'DOCTYPE'.",
|
||||
"expected-doctype-name-but-got-right-bracket":
|
||||
_("Unexpected > character. Expected DOCTYPE name."),
|
||||
"Unexpected > character. Expected DOCTYPE name.",
|
||||
"expected-doctype-name-but-got-eof":
|
||||
_("Unexpected end of file. Expected DOCTYPE name."),
|
||||
"Unexpected end of file. Expected DOCTYPE name.",
|
||||
"eof-in-doctype-name":
|
||||
_("Unexpected end of file in DOCTYPE name."),
|
||||
"Unexpected end of file in DOCTYPE name.",
|
||||
"eof-in-doctype":
|
||||
_("Unexpected end of file in DOCTYPE."),
|
||||
"Unexpected end of file in DOCTYPE.",
|
||||
"expected-space-or-right-bracket-in-doctype":
|
||||
_("Expected space or '>'. Got '%(data)s'"),
|
||||
"Expected space or '>'. Got '%(data)s'",
|
||||
"unexpected-end-of-doctype":
|
||||
_("Unexpected end of DOCTYPE."),
|
||||
"Unexpected end of DOCTYPE.",
|
||||
"unexpected-char-in-doctype":
|
||||
_("Unexpected character in DOCTYPE."),
|
||||
"Unexpected character in DOCTYPE.",
|
||||
"eof-in-innerhtml":
|
||||
_("XXX innerHTML EOF"),
|
||||
"XXX innerHTML EOF",
|
||||
"unexpected-doctype":
|
||||
_("Unexpected DOCTYPE. Ignored."),
|
||||
"Unexpected DOCTYPE. Ignored.",
|
||||
"non-html-root":
|
||||
_("html needs to be the first start tag."),
|
||||
"html needs to be the first start tag.",
|
||||
"expected-doctype-but-got-eof":
|
||||
_("Unexpected End of file. Expected DOCTYPE."),
|
||||
"Unexpected End of file. Expected DOCTYPE.",
|
||||
"unknown-doctype":
|
||||
_("Erroneous DOCTYPE."),
|
||||
"Erroneous DOCTYPE.",
|
||||
"expected-doctype-but-got-chars":
|
||||
_("Unexpected non-space characters. Expected DOCTYPE."),
|
||||
"Unexpected non-space characters. Expected DOCTYPE.",
|
||||
"expected-doctype-but-got-start-tag":
|
||||
_("Unexpected start tag (%(name)s). Expected DOCTYPE."),
|
||||
"Unexpected start tag (%(name)s). Expected DOCTYPE.",
|
||||
"expected-doctype-but-got-end-tag":
|
||||
_("Unexpected end tag (%(name)s). Expected DOCTYPE."),
|
||||
"Unexpected end tag (%(name)s). Expected DOCTYPE.",
|
||||
"end-tag-after-implied-root":
|
||||
_("Unexpected end tag (%(name)s) after the (implied) root element."),
|
||||
"Unexpected end tag (%(name)s) after the (implied) root element.",
|
||||
"expected-named-closing-tag-but-got-eof":
|
||||
_("Unexpected end of file. Expected end tag (%(name)s)."),
|
||||
"Unexpected end of file. Expected end tag (%(name)s).",
|
||||
"two-heads-are-not-better-than-one":
|
||||
_("Unexpected start tag head in existing head. Ignored."),
|
||||
"Unexpected start tag head in existing head. Ignored.",
|
||||
"unexpected-end-tag":
|
||||
_("Unexpected end tag (%(name)s). Ignored."),
|
||||
"Unexpected end tag (%(name)s). Ignored.",
|
||||
"unexpected-start-tag-out-of-my-head":
|
||||
_("Unexpected start tag (%(name)s) that can be in head. Moved."),
|
||||
"Unexpected start tag (%(name)s) that can be in head. Moved.",
|
||||
"unexpected-start-tag":
|
||||
_("Unexpected start tag (%(name)s)."),
|
||||
"Unexpected start tag (%(name)s).",
|
||||
"missing-end-tag":
|
||||
_("Missing end tag (%(name)s)."),
|
||||
"Missing end tag (%(name)s).",
|
||||
"missing-end-tags":
|
||||
_("Missing end tags (%(name)s)."),
|
||||
"Missing end tags (%(name)s).",
|
||||
"unexpected-start-tag-implies-end-tag":
|
||||
_("Unexpected start tag (%(startName)s) "
|
||||
"implies end tag (%(endName)s)."),
|
||||
"Unexpected start tag (%(startName)s) "
|
||||
"implies end tag (%(endName)s).",
|
||||
"unexpected-start-tag-treated-as":
|
||||
_("Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
|
||||
"Unexpected start tag (%(originalName)s). Treated as %(newName)s.",
|
||||
"deprecated-tag":
|
||||
_("Unexpected start tag %(name)s. Don't use it!"),
|
||||
"Unexpected start tag %(name)s. Don't use it!",
|
||||
"unexpected-start-tag-ignored":
|
||||
_("Unexpected start tag %(name)s. Ignored."),
|
||||
"Unexpected start tag %(name)s. Ignored.",
|
||||
"expected-one-end-tag-but-got-another":
|
||||
_("Unexpected end tag (%(gotName)s). "
|
||||
"Missing end tag (%(expectedName)s)."),
|
||||
"Unexpected end tag (%(gotName)s). "
|
||||
"Missing end tag (%(expectedName)s).",
|
||||
"end-tag-too-early":
|
||||
_("End tag (%(name)s) seen too early. Expected other end tag."),
|
||||
"End tag (%(name)s) seen too early. Expected other end tag.",
|
||||
"end-tag-too-early-named":
|
||||
_("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
|
||||
"Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s).",
|
||||
"end-tag-too-early-ignored":
|
||||
_("End tag (%(name)s) seen too early. Ignored."),
|
||||
"End tag (%(name)s) seen too early. Ignored.",
|
||||
"adoption-agency-1.1":
|
||||
_("End tag (%(name)s) violates step 1, "
|
||||
"paragraph 1 of the adoption agency algorithm."),
|
||||
"End tag (%(name)s) violates step 1, "
|
||||
"paragraph 1 of the adoption agency algorithm.",
|
||||
"adoption-agency-1.2":
|
||||
_("End tag (%(name)s) violates step 1, "
|
||||
"paragraph 2 of the adoption agency algorithm."),
|
||||
"End tag (%(name)s) violates step 1, "
|
||||
"paragraph 2 of the adoption agency algorithm.",
|
||||
"adoption-agency-1.3":
|
||||
_("End tag (%(name)s) violates step 1, "
|
||||
"paragraph 3 of the adoption agency algorithm."),
|
||||
"End tag (%(name)s) violates step 1, "
|
||||
"paragraph 3 of the adoption agency algorithm.",
|
||||
"adoption-agency-4.4":
|
||||
_("End tag (%(name)s) violates step 4, "
|
||||
"paragraph 4 of the adoption agency algorithm."),
|
||||
"End tag (%(name)s) violates step 4, "
|
||||
"paragraph 4 of the adoption agency algorithm.",
|
||||
"unexpected-end-tag-treated-as":
|
||||
_("Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
|
||||
"Unexpected end tag (%(originalName)s). Treated as %(newName)s.",
|
||||
"no-end-tag":
|
||||
_("This element (%(name)s) has no end tag."),
|
||||
"This element (%(name)s) has no end tag.",
|
||||
"unexpected-implied-end-tag-in-table":
|
||||
_("Unexpected implied end tag (%(name)s) in the table phase."),
|
||||
"Unexpected implied end tag (%(name)s) in the table phase.",
|
||||
"unexpected-implied-end-tag-in-table-body":
|
||||
_("Unexpected implied end tag (%(name)s) in the table body phase."),
|
||||
"Unexpected implied end tag (%(name)s) in the table body phase.",
|
||||
"unexpected-char-implies-table-voodoo":
|
||||
_("Unexpected non-space characters in "
|
||||
"table context caused voodoo mode."),
|
||||
"Unexpected non-space characters in "
|
||||
"table context caused voodoo mode.",
|
||||
"unexpected-hidden-input-in-table":
|
||||
_("Unexpected input with type hidden in table context."),
|
||||
"Unexpected input with type hidden in table context.",
|
||||
"unexpected-form-in-table":
|
||||
_("Unexpected form in table context."),
|
||||
"Unexpected form in table context.",
|
||||
"unexpected-start-tag-implies-table-voodoo":
|
||||
_("Unexpected start tag (%(name)s) in "
|
||||
"table context caused voodoo mode."),
|
||||
"Unexpected start tag (%(name)s) in "
|
||||
"table context caused voodoo mode.",
|
||||
"unexpected-end-tag-implies-table-voodoo":
|
||||
_("Unexpected end tag (%(name)s) in "
|
||||
"table context caused voodoo mode."),
|
||||
"Unexpected end tag (%(name)s) in "
|
||||
"table context caused voodoo mode.",
|
||||
"unexpected-cell-in-table-body":
|
||||
_("Unexpected table cell start tag (%(name)s) "
|
||||
"in the table body phase."),
|
||||
"Unexpected table cell start tag (%(name)s) "
|
||||
"in the table body phase.",
|
||||
"unexpected-cell-end-tag":
|
||||
_("Got table cell end tag (%(name)s) "
|
||||
"while required end tags are missing."),
|
||||
"Got table cell end tag (%(name)s) "
|
||||
"while required end tags are missing.",
|
||||
"unexpected-end-tag-in-table-body":
|
||||
_("Unexpected end tag (%(name)s) in the table body phase. Ignored."),
|
||||
"Unexpected end tag (%(name)s) in the table body phase. Ignored.",
|
||||
"unexpected-implied-end-tag-in-table-row":
|
||||
_("Unexpected implied end tag (%(name)s) in the table row phase."),
|
||||
"Unexpected implied end tag (%(name)s) in the table row phase.",
|
||||
"unexpected-end-tag-in-table-row":
|
||||
_("Unexpected end tag (%(name)s) in the table row phase. Ignored."),
|
||||
"Unexpected end tag (%(name)s) in the table row phase. Ignored.",
|
||||
"unexpected-select-in-select":
|
||||
_("Unexpected select start tag in the select phase "
|
||||
"treated as select end tag."),
|
||||
"Unexpected select start tag in the select phase "
|
||||
"treated as select end tag.",
|
||||
"unexpected-input-in-select":
|
||||
_("Unexpected input start tag in the select phase."),
|
||||
"Unexpected input start tag in the select phase.",
|
||||
"unexpected-start-tag-in-select":
|
||||
_("Unexpected start tag token (%(name)s in the select phase. "
|
||||
"Ignored."),
|
||||
"Unexpected start tag token (%(name)s in the select phase. "
|
||||
"Ignored.",
|
||||
"unexpected-end-tag-in-select":
|
||||
_("Unexpected end tag (%(name)s) in the select phase. Ignored."),
|
||||
"Unexpected end tag (%(name)s) in the select phase. Ignored.",
|
||||
"unexpected-table-element-start-tag-in-select-in-table":
|
||||
_("Unexpected table element start tag (%(name)s) in the select in table phase."),
|
||||
"Unexpected table element start tag (%(name)s) in the select in table phase.",
|
||||
"unexpected-table-element-end-tag-in-select-in-table":
|
||||
_("Unexpected table element end tag (%(name)s) in the select in table phase."),
|
||||
"Unexpected table element end tag (%(name)s) in the select in table phase.",
|
||||
"unexpected-char-after-body":
|
||||
_("Unexpected non-space characters in the after body phase."),
|
||||
"Unexpected non-space characters in the after body phase.",
|
||||
"unexpected-start-tag-after-body":
|
||||
_("Unexpected start tag token (%(name)s)"
|
||||
" in the after body phase."),
|
||||
"Unexpected start tag token (%(name)s)"
|
||||
" in the after body phase.",
|
||||
"unexpected-end-tag-after-body":
|
||||
_("Unexpected end tag token (%(name)s)"
|
||||
" in the after body phase."),
|
||||
"Unexpected end tag token (%(name)s)"
|
||||
" in the after body phase.",
|
||||
"unexpected-char-in-frameset":
|
||||
_("Unexpected characters in the frameset phase. Characters ignored."),
|
||||
"Unexpected characters in the frameset phase. Characters ignored.",
|
||||
"unexpected-start-tag-in-frameset":
|
||||
_("Unexpected start tag token (%(name)s)"
|
||||
" in the frameset phase. Ignored."),
|
||||
"Unexpected start tag token (%(name)s)"
|
||||
" in the frameset phase. Ignored.",
|
||||
"unexpected-frameset-in-frameset-innerhtml":
|
||||
_("Unexpected end tag token (frameset) "
|
||||
"in the frameset phase (innerHTML)."),
|
||||
"Unexpected end tag token (frameset) "
|
||||
"in the frameset phase (innerHTML).",
|
||||
"unexpected-end-tag-in-frameset":
|
||||
_("Unexpected end tag token (%(name)s)"
|
||||
" in the frameset phase. Ignored."),
|
||||
"Unexpected end tag token (%(name)s)"
|
||||
" in the frameset phase. Ignored.",
|
||||
"unexpected-char-after-frameset":
|
||||
_("Unexpected non-space characters in the "
|
||||
"after frameset phase. Ignored."),
|
||||
"Unexpected non-space characters in the "
|
||||
"after frameset phase. Ignored.",
|
||||
"unexpected-start-tag-after-frameset":
|
||||
_("Unexpected start tag (%(name)s)"
|
||||
" in the after frameset phase. Ignored."),
|
||||
"Unexpected start tag (%(name)s)"
|
||||
" in the after frameset phase. Ignored.",
|
||||
"unexpected-end-tag-after-frameset":
|
||||
_("Unexpected end tag (%(name)s)"
|
||||
" in the after frameset phase. Ignored."),
|
||||
"Unexpected end tag (%(name)s)"
|
||||
" in the after frameset phase. Ignored.",
|
||||
"unexpected-end-tag-after-body-innerhtml":
|
||||
_("Unexpected end tag after body(innerHtml)"),
|
||||
"Unexpected end tag after body(innerHtml)",
|
||||
"expected-eof-but-got-char":
|
||||
_("Unexpected non-space characters. Expected end of file."),
|
||||
"Unexpected non-space characters. Expected end of file.",
|
||||
"expected-eof-but-got-start-tag":
|
||||
_("Unexpected start tag (%(name)s)"
|
||||
". Expected end of file."),
|
||||
"Unexpected start tag (%(name)s)"
|
||||
". Expected end of file.",
|
||||
"expected-eof-but-got-end-tag":
|
||||
_("Unexpected end tag (%(name)s)"
|
||||
". Expected end of file."),
|
||||
"Unexpected end tag (%(name)s)"
|
||||
". Expected end of file.",
|
||||
"eof-in-table":
|
||||
_("Unexpected end of file. Expected table content."),
|
||||
"Unexpected end of file. Expected table content.",
|
||||
"eof-in-select":
|
||||
_("Unexpected end of file. Expected select content."),
|
||||
"Unexpected end of file. Expected select content.",
|
||||
"eof-in-frameset":
|
||||
_("Unexpected end of file. Expected frameset content."),
|
||||
"Unexpected end of file. Expected frameset content.",
|
||||
"eof-in-script-in-script":
|
||||
_("Unexpected end of file. Expected script content."),
|
||||
"Unexpected end of file. Expected script content.",
|
||||
"eof-in-foreign-lands":
|
||||
_("Unexpected end of file. Expected foreign content"),
|
||||
"Unexpected end of file. Expected foreign content",
|
||||
"non-void-element-with-trailing-solidus":
|
||||
_("Trailing solidus not allowed on element %(name)s"),
|
||||
"Trailing solidus not allowed on element %(name)s",
|
||||
"unexpected-html-element-in-foreign-content":
|
||||
_("Element %(name)s not allowed in a non-html context"),
|
||||
"Element %(name)s not allowed in a non-html context",
|
||||
"unexpected-end-tag-before-html":
|
||||
_("Unexpected end tag (%(name)s) before html."),
|
||||
"Unexpected end tag (%(name)s) before html.",
|
||||
"XXX-undefined-error":
|
||||
_("Undefined error (this sucks and should be fixed)"),
|
||||
"Undefined error (this sucks and should be fixed)",
|
||||
}
|
||||
|
||||
namespaces = {
|
||||
@@ -298,7 +296,7 @@ namespaces = {
|
||||
"xmlns": "http://www.w3.org/2000/xmlns/"
|
||||
}
|
||||
|
||||
scopingElements = frozenset((
|
||||
scopingElements = frozenset([
|
||||
(namespaces["html"], "applet"),
|
||||
(namespaces["html"], "caption"),
|
||||
(namespaces["html"], "html"),
|
||||
@@ -316,9 +314,9 @@ scopingElements = frozenset((
|
||||
(namespaces["svg"], "foreignObject"),
|
||||
(namespaces["svg"], "desc"),
|
||||
(namespaces["svg"], "title"),
|
||||
))
|
||||
])
|
||||
|
||||
formattingElements = frozenset((
|
||||
formattingElements = frozenset([
|
||||
(namespaces["html"], "a"),
|
||||
(namespaces["html"], "b"),
|
||||
(namespaces["html"], "big"),
|
||||
@@ -333,9 +331,9 @@ formattingElements = frozenset((
|
||||
(namespaces["html"], "strong"),
|
||||
(namespaces["html"], "tt"),
|
||||
(namespaces["html"], "u")
|
||||
))
|
||||
])
|
||||
|
||||
specialElements = frozenset((
|
||||
specialElements = frozenset([
|
||||
(namespaces["html"], "address"),
|
||||
(namespaces["html"], "applet"),
|
||||
(namespaces["html"], "area"),
|
||||
@@ -416,22 +414,22 @@ specialElements = frozenset((
|
||||
(namespaces["html"], "wbr"),
|
||||
(namespaces["html"], "xmp"),
|
||||
(namespaces["svg"], "foreignObject")
|
||||
))
|
||||
])
|
||||
|
||||
htmlIntegrationPointElements = frozenset((
|
||||
htmlIntegrationPointElements = frozenset([
|
||||
(namespaces["mathml"], "annotaion-xml"),
|
||||
(namespaces["svg"], "foreignObject"),
|
||||
(namespaces["svg"], "desc"),
|
||||
(namespaces["svg"], "title")
|
||||
))
|
||||
])
|
||||
|
||||
mathmlTextIntegrationPointElements = frozenset((
|
||||
mathmlTextIntegrationPointElements = frozenset([
|
||||
(namespaces["mathml"], "mi"),
|
||||
(namespaces["mathml"], "mo"),
|
||||
(namespaces["mathml"], "mn"),
|
||||
(namespaces["mathml"], "ms"),
|
||||
(namespaces["mathml"], "mtext")
|
||||
))
|
||||
])
|
||||
|
||||
adjustForeignAttributes = {
|
||||
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
|
||||
@@ -451,21 +449,21 @@ adjustForeignAttributes = {
|
||||
unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
|
||||
adjustForeignAttributes.items()])
|
||||
|
||||
spaceCharacters = frozenset((
|
||||
spaceCharacters = frozenset([
|
||||
"\t",
|
||||
"\n",
|
||||
"\u000C",
|
||||
" ",
|
||||
"\r"
|
||||
))
|
||||
])
|
||||
|
||||
tableInsertModeElements = frozenset((
|
||||
tableInsertModeElements = frozenset([
|
||||
"table",
|
||||
"tbody",
|
||||
"tfoot",
|
||||
"thead",
|
||||
"tr"
|
||||
))
|
||||
])
|
||||
|
||||
asciiLowercase = frozenset(string.ascii_lowercase)
|
||||
asciiUppercase = frozenset(string.ascii_uppercase)
|
||||
@@ -486,7 +484,7 @@ headingElements = (
|
||||
"h6"
|
||||
)
|
||||
|
||||
voidElements = frozenset((
|
||||
voidElements = frozenset([
|
||||
"base",
|
||||
"command",
|
||||
"event-source",
|
||||
@@ -502,11 +500,11 @@ voidElements = frozenset((
|
||||
"input",
|
||||
"source",
|
||||
"track"
|
||||
))
|
||||
])
|
||||
|
||||
cdataElements = frozenset(('title', 'textarea'))
|
||||
cdataElements = frozenset(['title', 'textarea'])
|
||||
|
||||
rcdataElements = frozenset((
|
||||
rcdataElements = frozenset([
|
||||
'style',
|
||||
'script',
|
||||
'xmp',
|
||||
@@ -514,27 +512,27 @@ rcdataElements = frozenset((
|
||||
'noembed',
|
||||
'noframes',
|
||||
'noscript'
|
||||
))
|
||||
])
|
||||
|
||||
booleanAttributes = {
|
||||
"": frozenset(("irrelevant",)),
|
||||
"style": frozenset(("scoped",)),
|
||||
"img": frozenset(("ismap",)),
|
||||
"audio": frozenset(("autoplay", "controls")),
|
||||
"video": frozenset(("autoplay", "controls")),
|
||||
"script": frozenset(("defer", "async")),
|
||||
"details": frozenset(("open",)),
|
||||
"datagrid": frozenset(("multiple", "disabled")),
|
||||
"command": frozenset(("hidden", "disabled", "checked", "default")),
|
||||
"hr": frozenset(("noshade")),
|
||||
"menu": frozenset(("autosubmit",)),
|
||||
"fieldset": frozenset(("disabled", "readonly")),
|
||||
"option": frozenset(("disabled", "readonly", "selected")),
|
||||
"optgroup": frozenset(("disabled", "readonly")),
|
||||
"button": frozenset(("disabled", "autofocus")),
|
||||
"input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")),
|
||||
"select": frozenset(("disabled", "readonly", "autofocus", "multiple")),
|
||||
"output": frozenset(("disabled", "readonly")),
|
||||
"": frozenset(["irrelevant"]),
|
||||
"style": frozenset(["scoped"]),
|
||||
"img": frozenset(["ismap"]),
|
||||
"audio": frozenset(["autoplay", "controls"]),
|
||||
"video": frozenset(["autoplay", "controls"]),
|
||||
"script": frozenset(["defer", "async"]),
|
||||
"details": frozenset(["open"]),
|
||||
"datagrid": frozenset(["multiple", "disabled"]),
|
||||
"command": frozenset(["hidden", "disabled", "checked", "default"]),
|
||||
"hr": frozenset(["noshade"]),
|
||||
"menu": frozenset(["autosubmit"]),
|
||||
"fieldset": frozenset(["disabled", "readonly"]),
|
||||
"option": frozenset(["disabled", "readonly", "selected"]),
|
||||
"optgroup": frozenset(["disabled", "readonly"]),
|
||||
"button": frozenset(["disabled", "autofocus"]),
|
||||
"input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
|
||||
"select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
|
||||
"output": frozenset(["disabled", "readonly"]),
|
||||
}
|
||||
|
||||
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
|
||||
@@ -574,7 +572,7 @@ entitiesWindows1252 = (
|
||||
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||
)
|
||||
|
||||
xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;'))
|
||||
xmlEntities = frozenset(['lt;', 'gt;', 'amp;', 'apos;', 'quot;'])
|
||||
|
||||
entities = {
|
||||
"AElig": "\xc6",
|
||||
@@ -3088,8 +3086,8 @@ tokenTypes = {
|
||||
"ParseError": 7
|
||||
}
|
||||
|
||||
tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"],
|
||||
tokenTypes["EmptyTag"]))
|
||||
tagTokenTypes = frozenset([tokenTypes["StartTag"], tokenTypes["EndTag"],
|
||||
tokenTypes["EmptyTag"]])
|
||||
|
||||
|
||||
prefixes = dict([(v, k) for k, v in namespaces.items()])
|
||||
|
||||
@@ -1,8 +1,5 @@
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from gettext import gettext
|
||||
_ = gettext
|
||||
|
||||
from . import _base
|
||||
from ..constants import cdataElements, rcdataElements, voidElements
|
||||
|
||||
@@ -23,24 +20,24 @@ class Filter(_base.Filter):
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
name = token["name"]
|
||||
if contentModelFlag != "PCDATA":
|
||||
raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name})
|
||||
raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name})
|
||||
if not isinstance(name, str):
|
||||
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
|
||||
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
|
||||
if not name:
|
||||
raise LintError(_("Empty tag name"))
|
||||
raise LintError("Empty tag name")
|
||||
if type == "StartTag" and name in voidElements:
|
||||
raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name})
|
||||
raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name})
|
||||
elif type == "EmptyTag" and name not in voidElements:
|
||||
raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]})
|
||||
raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
|
||||
if type == "StartTag":
|
||||
open_elements.append(name)
|
||||
for name, value in token["data"]:
|
||||
if not isinstance(name, str):
|
||||
raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name})
|
||||
raise LintError("Attribute name is not a string: %(name)r" % {"name": name})
|
||||
if not name:
|
||||
raise LintError(_("Empty attribute name"))
|
||||
raise LintError("Empty attribute name")
|
||||
if not isinstance(value, str):
|
||||
raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value})
|
||||
raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
|
||||
if name in cdataElements:
|
||||
contentModelFlag = "CDATA"
|
||||
elif name in rcdataElements:
|
||||
@@ -51,43 +48,43 @@ class Filter(_base.Filter):
|
||||
elif type == "EndTag":
|
||||
name = token["name"]
|
||||
if not isinstance(name, str):
|
||||
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
|
||||
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
|
||||
if not name:
|
||||
raise LintError(_("Empty tag name"))
|
||||
raise LintError("Empty tag name")
|
||||
if name in voidElements:
|
||||
raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name})
|
||||
raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name})
|
||||
start_name = open_elements.pop()
|
||||
if start_name != name:
|
||||
raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name})
|
||||
raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
|
||||
contentModelFlag = "PCDATA"
|
||||
|
||||
elif type == "Comment":
|
||||
if contentModelFlag != "PCDATA":
|
||||
raise LintError(_("Comment not in PCDATA content model flag"))
|
||||
raise LintError("Comment not in PCDATA content model flag")
|
||||
|
||||
elif type in ("Characters", "SpaceCharacters"):
|
||||
data = token["data"]
|
||||
if not isinstance(data, str):
|
||||
raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data})
|
||||
raise LintError("Attribute name is not a string: %(name)r" % {"name": data})
|
||||
if not data:
|
||||
raise LintError(_("%(type)s token with empty data") % {"type": type})
|
||||
raise LintError("%(type)s token with empty data" % {"type": type})
|
||||
if type == "SpaceCharacters":
|
||||
data = data.strip(spaceCharacters)
|
||||
if data:
|
||||
raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data})
|
||||
raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})
|
||||
|
||||
elif type == "Doctype":
|
||||
name = token["name"]
|
||||
if contentModelFlag != "PCDATA":
|
||||
raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name})
|
||||
raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name})
|
||||
if not isinstance(name, str):
|
||||
raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
|
||||
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
|
||||
# XXX: what to do with token["data"] ?
|
||||
|
||||
elif type in ("ParseError", "SerializeError"):
|
||||
pass
|
||||
|
||||
else:
|
||||
raise LintError(_("Unknown token type: %(type)s") % {"type": type})
|
||||
raise LintError("Unknown token type: %(type)s" % {"type": type})
|
||||
|
||||
yield token
|
||||
|
||||
@@ -18,6 +18,7 @@ from .constants import cdataElements, rcdataElements
|
||||
from .constants import tokenTypes, ReparseException, namespaces
|
||||
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
|
||||
from .constants import adjustForeignAttributes as adjustForeignAttributesMap
|
||||
from .constants import E
|
||||
|
||||
|
||||
def parse(doc, treebuilder="etree", encoding=None,
|
||||
@@ -129,6 +130,17 @@ class HTMLParser(object):
|
||||
|
||||
self.framesetOK = True
|
||||
|
||||
@property
|
||||
def documentEncoding(self):
|
||||
"""The name of the character encoding
|
||||
that was used to decode the input stream,
|
||||
or :obj:`None` if that is not determined yet.
|
||||
|
||||
"""
|
||||
if not hasattr(self, 'tokenizer'):
|
||||
return None
|
||||
return self.tokenizer.stream.charEncoding[0]
|
||||
|
||||
def isHTMLIntegrationPoint(self, element):
|
||||
if (element.name == "annotation-xml" and
|
||||
element.namespace == namespaces["mathml"]):
|
||||
@@ -245,7 +257,7 @@ class HTMLParser(object):
|
||||
# XXX The idea is to make errorcode mandatory.
|
||||
self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
|
||||
if self.strict:
|
||||
raise ParseError
|
||||
raise ParseError(E[errorcode] % datavars)
|
||||
|
||||
def normalizeToken(self, token):
|
||||
""" HTML5 specific normalizations to the token stream """
|
||||
@@ -868,7 +880,7 @@ def getPhases(debug):
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
(("base", "basefont", "bgsound", "command", "link", "meta",
|
||||
"noframes", "script", "style", "title"),
|
||||
"script", "style", "title"),
|
||||
self.startTagProcessInHead),
|
||||
("body", self.startTagBody),
|
||||
("frameset", self.startTagFrameset),
|
||||
@@ -1205,8 +1217,7 @@ def getPhases(debug):
|
||||
attributes["name"] = "isindex"
|
||||
self.processStartTag(impliedTagToken("input", "StartTag",
|
||||
attributes=attributes,
|
||||
selfClosing=
|
||||
token["selfClosing"]))
|
||||
selfClosing=token["selfClosing"]))
|
||||
self.processEndTag(impliedTagToken("label"))
|
||||
self.processStartTag(impliedTagToken("hr", "StartTag"))
|
||||
self.processEndTag(impliedTagToken("form"))
|
||||
|
||||
@@ -28,7 +28,18 @@ asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
|
||||
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
|
||||
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
|
||||
|
||||
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
|
||||
|
||||
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
|
||||
|
||||
if utils.supports_lone_surrogates:
|
||||
# Use one extra step of indirection and create surrogates with
|
||||
# unichr. Not using this indirection would introduce an illegal
|
||||
# unicode literal on platforms not supporting such lone
|
||||
# surrogates.
|
||||
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
|
||||
eval('"\\uD800-\\uDFFF"'))
|
||||
else:
|
||||
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
|
||||
|
||||
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
|
||||
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
|
||||
@@ -164,13 +175,18 @@ class HTMLUnicodeInputStream(object):
|
||||
|
||||
"""
|
||||
|
||||
# Craziness
|
||||
if len("\U0010FFFF") == 1:
|
||||
if not utils.supports_lone_surrogates:
|
||||
# Such platforms will have already checked for such
|
||||
# surrogate errors, so no need to do this checking.
|
||||
self.reportCharacterErrors = None
|
||||
self.replaceCharactersRegexp = None
|
||||
elif len("\U0010FFFF") == 1:
|
||||
self.reportCharacterErrors = self.characterErrorsUCS4
|
||||
self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
|
||||
self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
|
||||
else:
|
||||
self.reportCharacterErrors = self.characterErrorsUCS2
|
||||
self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
|
||||
self.replaceCharactersRegexp = re.compile(
|
||||
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
|
||||
|
||||
# List of where new lines occur
|
||||
self.newLines = [0]
|
||||
@@ -265,11 +281,12 @@ class HTMLUnicodeInputStream(object):
|
||||
self._bufferedCharacter = data[-1]
|
||||
data = data[:-1]
|
||||
|
||||
self.reportCharacterErrors(data)
|
||||
if self.reportCharacterErrors:
|
||||
self.reportCharacterErrors(data)
|
||||
|
||||
# Replace invalid characters
|
||||
# Note U+0000 is dealt with in the tokenizer
|
||||
data = self.replaceCharactersRegexp.sub("\ufffd", data)
|
||||
# Replace invalid characters
|
||||
# Note U+0000 is dealt with in the tokenizer
|
||||
data = self.replaceCharactersRegexp.sub("\ufffd", data)
|
||||
|
||||
data = data.replace("\r\n", "\n")
|
||||
data = data.replace("\r", "\n")
|
||||
|
||||
@@ -2,11 +2,26 @@ from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import re
|
||||
from xml.sax.saxutils import escape, unescape
|
||||
from six.moves import urllib_parse as urlparse
|
||||
|
||||
from .tokenizer import HTMLTokenizer
|
||||
from .constants import tokenTypes
|
||||
|
||||
|
||||
content_type_rgx = re.compile(r'''
|
||||
^
|
||||
# Match a content type <application>/<type>
|
||||
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
|
||||
# Match any character set and encoding
|
||||
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|
||||
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
|
||||
# Assume the rest is data
|
||||
,.*
|
||||
$
|
||||
''',
|
||||
re.VERBOSE)
|
||||
|
||||
|
||||
class HTMLSanitizerMixin(object):
|
||||
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
||||
|
||||
@@ -100,8 +115,8 @@ class HTMLSanitizerMixin(object):
|
||||
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
|
||||
'y1', 'y2', 'zoomAndPan']
|
||||
|
||||
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster',
|
||||
'xlink:href', 'xml:base']
|
||||
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc',
|
||||
'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base']
|
||||
|
||||
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
|
||||
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
|
||||
@@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object):
|
||||
acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
|
||||
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
|
||||
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
|
||||
'ssh', 'sftp', 'rtsp', 'afs']
|
||||
'ssh', 'sftp', 'rtsp', 'afs', 'data']
|
||||
|
||||
acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
|
||||
|
||||
# subclasses may define their own versions of these constants
|
||||
allowed_elements = acceptable_elements + mathml_elements + svg_elements
|
||||
@@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object):
|
||||
allowed_css_keywords = acceptable_css_keywords
|
||||
allowed_svg_properties = acceptable_svg_properties
|
||||
allowed_protocols = acceptable_protocols
|
||||
allowed_content_types = acceptable_content_types
|
||||
|
||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
|
||||
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
|
||||
@@ -189,10 +207,17 @@ class HTMLSanitizerMixin(object):
|
||||
unescape(attrs[attr])).lower()
|
||||
# remove replacement characters from unescaped characters
|
||||
val_unescaped = val_unescaped.replace("\ufffd", "")
|
||||
if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
|
||||
(val_unescaped.split(':')[0] not in
|
||||
self.allowed_protocols)):
|
||||
del attrs[attr]
|
||||
uri = urlparse.urlparse(val_unescaped)
|
||||
if uri and uri.scheme:
|
||||
if uri.scheme not in self.allowed_protocols:
|
||||
del attrs[attr]
|
||||
if uri.scheme == 'data':
|
||||
m = content_type_rgx.match(uri.path)
|
||||
if not m:
|
||||
del attrs[attr]
|
||||
elif m.group('content_type') not in self.allowed_content_types:
|
||||
del attrs[attr]
|
||||
|
||||
for attr in self.svg_attr_val_allows_ref:
|
||||
if attr in attrs:
|
||||
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
|
||||
@@ -245,7 +270,7 @@ class HTMLSanitizerMixin(object):
|
||||
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
|
||||
'padding']:
|
||||
for keyword in value.split():
|
||||
if not keyword in self.acceptable_css_keywords and \
|
||||
if keyword not in self.acceptable_css_keywords and \
|
||||
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
|
||||
break
|
||||
else:
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
from six import text_type
|
||||
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
try:
|
||||
from functools import reduce
|
||||
except ImportError:
|
||||
@@ -35,7 +32,7 @@ else:
|
||||
v = utils.surrogatePairToCodepoint(v)
|
||||
else:
|
||||
v = ord(v)
|
||||
if not v in encode_entity_map or k.islower():
|
||||
if v not in encode_entity_map or k.islower():
|
||||
# prefer < over < and similarly for &, >, etc.
|
||||
encode_entity_map[v] = k
|
||||
|
||||
@@ -208,7 +205,7 @@ class HTMLSerializer(object):
|
||||
if token["systemId"]:
|
||||
if token["systemId"].find('"') >= 0:
|
||||
if token["systemId"].find("'") >= 0:
|
||||
self.serializeError(_("System identifer contains both single and double quote characters"))
|
||||
self.serializeError("System identifer contains both single and double quote characters")
|
||||
quote_char = "'"
|
||||
else:
|
||||
quote_char = '"'
|
||||
@@ -220,7 +217,7 @@ class HTMLSerializer(object):
|
||||
elif type in ("Characters", "SpaceCharacters"):
|
||||
if type == "SpaceCharacters" or in_cdata:
|
||||
if in_cdata and token["data"].find("</") >= 0:
|
||||
self.serializeError(_("Unexpected </ in CDATA"))
|
||||
self.serializeError("Unexpected </ in CDATA")
|
||||
yield self.encode(token["data"])
|
||||
else:
|
||||
yield self.encode(escape(token["data"]))
|
||||
@@ -231,7 +228,7 @@ class HTMLSerializer(object):
|
||||
if name in rcdataElements and not self.escape_rcdata:
|
||||
in_cdata = True
|
||||
elif in_cdata:
|
||||
self.serializeError(_("Unexpected child element of a CDATA element"))
|
||||
self.serializeError("Unexpected child element of a CDATA element")
|
||||
for (attr_namespace, attr_name), attr_value in token["data"].items():
|
||||
# TODO: Add namespace support here
|
||||
k = attr_name
|
||||
@@ -279,20 +276,20 @@ class HTMLSerializer(object):
|
||||
if name in rcdataElements:
|
||||
in_cdata = False
|
||||
elif in_cdata:
|
||||
self.serializeError(_("Unexpected child element of a CDATA element"))
|
||||
self.serializeError("Unexpected child element of a CDATA element")
|
||||
yield self.encodeStrict("</%s>" % name)
|
||||
|
||||
elif type == "Comment":
|
||||
data = token["data"]
|
||||
if data.find("--") >= 0:
|
||||
self.serializeError(_("Comment contains --"))
|
||||
self.serializeError("Comment contains --")
|
||||
yield self.encodeStrict("<!--%s-->" % token["data"])
|
||||
|
||||
elif type == "Entity":
|
||||
name = token["name"]
|
||||
key = name + ";"
|
||||
if not key in entities:
|
||||
self.serializeError(_("Entity %s not recognized" % name))
|
||||
if key not in entities:
|
||||
self.serializeError("Entity %s not recognized" % name)
|
||||
if self.resolve_entities and key not in xmlEntities:
|
||||
data = entities[key]
|
||||
else:
|
||||
|
||||
@@ -158,7 +158,7 @@ def getDomBuilder(DomImplementation):
|
||||
else:
|
||||
# HACK: allow text nodes as children of the document node
|
||||
if hasattr(self.dom, '_child_node_types'):
|
||||
if not Node.TEXT_NODE in self.dom._child_node_types:
|
||||
if Node.TEXT_NODE not in self.dom._child_node_types:
|
||||
self.dom._child_node_types = list(self.dom._child_node_types)
|
||||
self.dom._child_node_types.append(Node.TEXT_NODE)
|
||||
self.dom.appendChild(self.dom.createTextNode(data))
|
||||
|
||||
@@ -10,8 +10,12 @@ returning an iterator generating tokens.
|
||||
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree",
|
||||
"pulldom"]
|
||||
|
||||
import sys
|
||||
|
||||
from .. import constants
|
||||
from ..utils import default_etree
|
||||
|
||||
treeWalkerCache = {}
|
||||
@@ -55,3 +59,89 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
|
||||
# XXX: NEVER cache here, caching is done in the etree submodule
|
||||
return etree.getETreeModule(implementation, **kwargs).TreeWalker
|
||||
return treeWalkerCache.get(treeType)
|
||||
|
||||
|
||||
def concatenateCharacterTokens(tokens):
|
||||
pendingCharacters = []
|
||||
for token in tokens:
|
||||
type = token["type"]
|
||||
if type in ("Characters", "SpaceCharacters"):
|
||||
pendingCharacters.append(token["data"])
|
||||
else:
|
||||
if pendingCharacters:
|
||||
yield {"type": "Characters", "data": "".join(pendingCharacters)}
|
||||
pendingCharacters = []
|
||||
yield token
|
||||
if pendingCharacters:
|
||||
yield {"type": "Characters", "data": "".join(pendingCharacters)}
|
||||
|
||||
|
||||
def pprint(walker):
|
||||
"""Pretty printer for tree walkers"""
|
||||
output = []
|
||||
indent = 0
|
||||
for token in concatenateCharacterTokens(walker):
|
||||
type = token["type"]
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
# tag name
|
||||
if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
|
||||
if token["namespace"] in constants.prefixes:
|
||||
ns = constants.prefixes[token["namespace"]]
|
||||
else:
|
||||
ns = token["namespace"]
|
||||
name = "%s %s" % (ns, token["name"])
|
||||
else:
|
||||
name = token["name"]
|
||||
output.append("%s<%s>" % (" " * indent, name))
|
||||
indent += 2
|
||||
# attributes (sorted for consistent ordering)
|
||||
attrs = token["data"]
|
||||
for (namespace, localname), value in sorted(attrs.items()):
|
||||
if namespace:
|
||||
if namespace in constants.prefixes:
|
||||
ns = constants.prefixes[namespace]
|
||||
else:
|
||||
ns = namespace
|
||||
name = "%s %s" % (ns, localname)
|
||||
else:
|
||||
name = localname
|
||||
output.append("%s%s=\"%s\"" % (" " * indent, name, value))
|
||||
# self-closing
|
||||
if type == "EmptyTag":
|
||||
indent -= 2
|
||||
|
||||
elif type == "EndTag":
|
||||
indent -= 2
|
||||
|
||||
elif type == "Comment":
|
||||
output.append("%s<!-- %s -->" % (" " * indent, token["data"]))
|
||||
|
||||
elif type == "Doctype":
|
||||
if token["name"]:
|
||||
if token["publicId"]:
|
||||
output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
|
||||
(" " * indent,
|
||||
token["name"],
|
||||
token["publicId"],
|
||||
token["systemId"] if token["systemId"] else ""))
|
||||
elif token["systemId"]:
|
||||
output.append("""%s<!DOCTYPE %s "" "%s">""" %
|
||||
(" " * indent,
|
||||
token["name"],
|
||||
token["systemId"]))
|
||||
else:
|
||||
output.append("%s<!DOCTYPE %s>" % (" " * indent,
|
||||
token["name"]))
|
||||
else:
|
||||
output.append("%s<!DOCTYPE >" % (" " * indent,))
|
||||
|
||||
elif type == "Characters":
|
||||
output.append("%s\"%s\"" % (" " * indent, token["data"]))
|
||||
|
||||
elif type == "SpaceCharacters":
|
||||
assert False, "concatenateCharacterTokens should have got rid of all Space tokens"
|
||||
|
||||
else:
|
||||
raise ValueError("Unknown token type, %s" % type)
|
||||
|
||||
return "\n".join(output)
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
from six import text_type, string_types
|
||||
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
|
||||
"TreeWalker", "NonRecursiveTreeWalker"]
|
||||
|
||||
from xml.dom import Node
|
||||
|
||||
@@ -58,7 +58,7 @@ class TreeWalker(object):
|
||||
"namespace": to_text(namespace),
|
||||
"data": attrs}
|
||||
if hasChildren:
|
||||
yield self.error(_("Void element has children"))
|
||||
yield self.error("Void element has children")
|
||||
|
||||
def startTag(self, namespace, name, attrs):
|
||||
assert namespace is None or isinstance(namespace, string_types), type(namespace)
|
||||
@@ -122,7 +122,7 @@ class TreeWalker(object):
|
||||
return {"type": "Entity", "name": text_type(name)}
|
||||
|
||||
def unknown(self, nodeType):
|
||||
return self.error(_("Unknown node type: ") + nodeType)
|
||||
return self.error("Unknown node type: " + nodeType)
|
||||
|
||||
|
||||
class NonRecursiveTreeWalker(TreeWalker):
|
||||
|
||||
@@ -2,9 +2,6 @@ from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from xml.dom import Node
|
||||
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
from . import _base
|
||||
|
||||
|
||||
|
||||
@@ -7,12 +7,10 @@ except ImportError:
|
||||
from ordereddict import OrderedDict
|
||||
except ImportError:
|
||||
OrderedDict = dict
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
import re
|
||||
|
||||
from six import text_type
|
||||
from six import string_types
|
||||
|
||||
from . import _base
|
||||
from ..utils import moduleFactoryFactory
|
||||
@@ -60,7 +58,7 @@ def getETreeBuilder(ElementTreeImplementation):
|
||||
return _base.COMMENT, node.text
|
||||
|
||||
else:
|
||||
assert type(node.tag) == text_type, type(node.tag)
|
||||
assert isinstance(node.tag, string_types), type(node.tag)
|
||||
# This is assumed to be an ordinary element
|
||||
match = tag_regexp.match(node.tag)
|
||||
if match:
|
||||
|
||||
@@ -4,9 +4,6 @@ from six import text_type
|
||||
from lxml import etree
|
||||
from ..treebuilders.etree import tag_regexp
|
||||
|
||||
from gettext import gettext
|
||||
_ = gettext
|
||||
|
||||
from . import _base
|
||||
|
||||
from .. import ihatexml
|
||||
@@ -130,7 +127,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
def getNodeDetails(self, node):
|
||||
if isinstance(node, tuple): # Text node
|
||||
node, key = node
|
||||
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
|
||||
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
|
||||
return _base.TEXT, ensure_str(getattr(node, key))
|
||||
|
||||
elif isinstance(node, Root):
|
||||
@@ -169,7 +166,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
attrs, len(node) > 0 or node.text)
|
||||
|
||||
def getFirstChild(self, node):
|
||||
assert not isinstance(node, tuple), _("Text nodes have no children")
|
||||
assert not isinstance(node, tuple), "Text nodes have no children"
|
||||
|
||||
assert len(node) or node.text, "Node has no children"
|
||||
if node.text:
|
||||
@@ -180,7 +177,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
def getNextSibling(self, node):
|
||||
if isinstance(node, tuple): # Text node
|
||||
node, key = node
|
||||
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
|
||||
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
|
||||
if key == "text":
|
||||
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
|
||||
# because node[0] might evaluate to False if it has no child element
|
||||
@@ -196,7 +193,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
def getParentNode(self, node):
|
||||
if isinstance(node, tuple): # Text node
|
||||
node, key = node
|
||||
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
|
||||
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
|
||||
if key == "text":
|
||||
return node
|
||||
# else: fallback to "normal" processing
|
||||
|
||||
@@ -2,6 +2,8 @@ from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from types import ModuleType
|
||||
|
||||
from six import text_type
|
||||
|
||||
try:
|
||||
import xml.etree.cElementTree as default_etree
|
||||
except ImportError:
|
||||
@@ -9,7 +11,26 @@ except ImportError:
|
||||
|
||||
|
||||
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
|
||||
"surrogatePairToCodepoint", "moduleFactoryFactory"]
|
||||
"surrogatePairToCodepoint", "moduleFactoryFactory",
|
||||
"supports_lone_surrogates"]
|
||||
|
||||
|
||||
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
|
||||
# caught by the below test. In general this would be any platform
|
||||
# using UTF-16 as its encoding of unicode strings, such as
|
||||
# Jython. This is because UTF-16 itself is based on the use of such
|
||||
# surrogates, and there is no mechanism to further escape such
|
||||
# escapes.
|
||||
try:
|
||||
_x = eval('"\\uD800"')
|
||||
if not isinstance(_x, text_type):
|
||||
# We need this with u"" because of http://bugs.jython.org/issue2039
|
||||
_x = eval('u"\\uD800"')
|
||||
assert isinstance(_x, text_type)
|
||||
except:
|
||||
supports_lone_surrogates = False
|
||||
else:
|
||||
supports_lone_surrogates = True
|
||||
|
||||
|
||||
class MethodDispatcher(dict):
|
||||
|
||||
Reference in New Issue
Block a user