#!/usr/bin/python
# -*- coding: ISO-8859-15 -*-
import os
import sys
import urllib2
import urlparse
from os import path
from HTMLParser import HTMLParser
from sgmllib import SGMLParser
class PageParser(SGMLParser):
    """Re-serialise an HTML page, letting a callback rewrite attribute values.

    Each tag, text node, comment and entity/character reference reported by
    the SGML parser is appended to an internal buffer; ``parse`` returns the
    buffer joined into one string.  Two things differ from the input:

    * start and end tags whose name is in ``tags_to_remove`` are dropped
      (text found between them is still emitted);
    * every attribute value is replaced by the return value of
      ``on_attribute_visited(tag, attribute_name, attribute_value)``.
    """

    def __init__(self, on_attribute_visited, tags_to_remove=('base',)):
        """Store the attribute callback and the tag names to strip."""
        SGMLParser.__init__(self)
        self.on_attribute_visited = on_attribute_visited
        self.tags_to_remove = tags_to_remove
        # Created here as well as in parse() so the handlers never hit a
        # missing attribute when feed() is called directly.
        self._result = []

    def unknown_starttag(self, tag, attrs):
        """Emit the start tag with each attribute passed through the callback."""
        if tag.lower() in self.tags_to_remove:
            return None
        pieces = ['<%s' % tag]
        for nom_attr, val_attr in attrs:
            val_attr = self.on_attribute_visited(tag, nom_attr, val_attr)
            pieces.append(' %s="%s" ' % (nom_attr, val_attr))
        pieces.append('>')
        self._result.append(''.join(pieces))

    def unknown_endtag(self, tag):
        """Emit the end tag unless the tag is one of those being removed."""
        if tag.lower() in self.tags_to_remove:
            return None
        self._result.append('</%s>' % tag)

    def parse(self, data):
        """Parse ``data`` and return the rebuilt page as a string."""
        # reset() + a fresh buffer make the instance reusable; close() forces
        # the parser to process whatever it is still buffering.
        self.reset()
        self._result = []
        self.feed(data)
        self.close()
        return ''.join(self._result)

    def handle_data(self, data):
        """Copy text content through unchanged."""
        self._result.append(data)

    def handle_comment(self, comment):
        """Re-emit a comment exactly as it was delimited in the source."""
        # No padding is added: the text received is the raw content between
        # the delimiters, and extra spaces would alter constructs such as
        # conditional comments.
        self._result.append('<!--%s-->' % comment)

    def handle_entityref(self, ref):
        """Re-emit an entity reference instead of letting it be decoded.

        The closing ';' is restored only when ``ref`` is listed in
        ``self.entitydefs``; for any other name the text is emitted as
        '&name', since it may not have been a real entity (e.g. 'AT&T').
        """
        if ref in self.entitydefs:
            terminator = ';'
        else:
            terminator = ''
        self._result.append('&%s%s' % (ref, terminator))

    # Former (misspelled) name, kept so existing callers do not break.
    handle_entyref = handle_entityref

    def handle_charref(self, ref):
        """Re-emit a numeric character reference, terminator included."""
        # Without the ';' a following digit would be absorbed into the
        # reference ('&#49;2' would come out as '&#492').
        self._result.append('&#%s;' % ref)
class PageWeb(object):
    """Save a web page locally together with the media it references.

    ``telecharge`` fetches ``url``, downloads the targets of the ``src`` /
    ``href`` attributes of ``img``, ``link`` and ``script`` tags into the
    ``_files`` directory (relative to the current working directory), and
    writes the page with those attributes pointing at the local copies.
    """

    def __init__(self, url):
        """Remember the URL of the page to download."""
        self.url = url

    def _cprint(self, msg):
        """Write ``msg`` to stdout and flush so progress appears at once."""
        sys.stdout.write(msg)
        sys.stdout.flush()

    def _recupere_contenu(self, url):
        """Fetch ``url`` and return its body; prints one progress dot."""
        self._cprint('.')
        reponse = urllib2.urlopen(urllib2.Request(url))
        try:
            return reponse.read()
        finally:
            # Release the connection even when reading fails.
            reponse.close()

    def _nettoie_url(self, url):
        """Return ``url`` as an absolute URL, resolved against the page URL.

        Absolute URLs of any scheme are returned as they are; relative,
        root-relative ('/x') and scheme-relative ('//host/x') references are
        resolved the way a browser would for a page located at ``self.url``.
        """
        return urlparse.urljoin(self.url, url.strip())

    def _remplace_source(self, source):
        """Download ``source`` once and return the local path replacing it.

        Results are cached in ``self._media`` keyed by absolute URL.  When
        the resource cannot be fetched, the absolute URL is returned (and
        cached) so the saved page still points at the online copy.
        """
        source = self._nettoie_url(source)
        if source in self._media:
            return self._media[source]
        try:
            contenu = self._recupere_contenu(source)
        except (urllib2.URLError, ValueError):
            # One unreachable media file must not abort the whole page.
            self._media[source] = source
            return source
        filename = '_files/file_' + str(self._count)
        # Binary mode: media files must be stored byte for byte.
        f = open(filename, 'wb')
        try:
            f.write(contenu)
        finally:
            f.close()
        self._media[source] = filename
        self._count += 1
        return filename

    def _media_needed(self, tag, attribut, valeur):
        """Download the target and rewrite the link when the attribute needs it.

        Only ``href`` / ``src`` attributes of ``img``, ``link`` and
        ``script`` tags are handled; any other value is returned unchanged.
        """
        est_media = tag.lower() in ('img', 'link', 'script')
        if est_media and attribut.lower() in ('href', 'src'):
            return self._remplace_source(valeur)
        return valeur

    def telecharge(self, nom_fichier=None):
        """Download the page and its media, then write the page to disk.

        ``nom_fichier`` is the output path; when omitted, the last non-empty
        '/'-separated component of the URL is used.  Exits the process with
        status 1 when the page itself cannot be fetched.
        """
        self._count = 0
        self._media = {}
        # Site root ('scheme://host/'), still exposed as an attribute.
        scheme, netloc = urlparse.urlsplit(self.url)[:2]
        self.urlbase = '%s://%s/' % (scheme, netloc)
        self._cprint('Récupération de %s' % self.url)
        try:
            contenu = self._recupere_contenu(self.url)
        except (urllib2.URLError, ValueError):
            self._cprint("Impossible de lire l'url\n")
            sys.exit(1)
        # Create the sub-directory that receives the media files.
        if not os.path.exists('_files'):
            os.mkdir('_files')
        # Walk the page, downloading the media and rewriting their links.
        parseur = PageParser(self._media_needed)
        contenu = parseur.parse(contenu)
        # Save the page itself.
        if nom_fichier is None:
            morceaux = [part for part in self.url.split('/') if part != '']
            nom_fichier = morceaux[-1]
        fichier = open(nom_fichier, 'wb')
        try:
            fichier.write(contenu)
        finally:
            fichier.close()
        self._cprint('fichier "%s" créé\n' % os.path.basename(nom_fichier))
if __name__ == '__main__':
    # Exactly one command-line argument is expected: the URL of the page.
    if len(sys.argv) != 2:
        print('utilisation: %s url' % sys.argv[0])
        # Non-zero status so callers can detect the usage error.
        sys.exit(1)
    url = sys.argv[1]
    ma_page = PageWeb(url)
    ma_page.telecharge()