from urllib.parse import urlparse
from urllib.error import URLError, HTTPError
from lxml.html import parse
from contextlib import closing
import shutil, sys, re
import signal          # needed by the handlers below; its import is elided in the original
import urllib.request  # needed for Request/urlopen below; its import is elided in the original
from threading import Thread
from queue import Queue
from random import randint
# TODO:
# * 'URLs waiting...' displays negative queue size
# * evaluate project against phoemur/wgetter (download utility written in
#   Python): https://github.com/phoemur/wgetter
# * move these notes into gitlab
# * add command line options
# * download priority: smallest files first
# * num threads per site
# * randomized test download file sizes
# * output order (downloaded first)
# * use url description dictionary to add callbacks for links


class MyError(Exception):
    """Raised by the signal handlers; definition assumed, elided in the original."""


def myHandler(sig, frame):
    raise MyError('Received signal ' + str(sig) +
                  ' on line ' + str(frame.f_lineno) +
                  ' in ' + frame.f_code.co_filename)


signal.signal(signal.SIGINT, myHandler)
signal.signal(signal.SIGHUP, myHandler)
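# Note: CPython delivers signals to the main thread only, so MyError raised
# in myHandler surfaces in the main loop rather than inside worker threads.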
84 """ Configuration parameters 86 3. verify output directory -cwd(?) 87 2. open input file -save descriptor 90 input_file_name =
"infile" 92 output_dir_path =
"./" 102 def __init__(self, input_url, download_path, file_id, report):
104 purl = urllib.parse.urlparse(input_url)
108 download_path_start = purl.scheme +
"://" + purl.netloc + download_path + purl.path.split(
'/')[file_id]
120 """ Follow the url through redirects 123 with closing(urllib.request.urlopen(url))
as stream:
124 next = parse(stream).xpath(
"//meta[@http-equiv = 'refresh']/@content")
126 url = next[0].split(
";")[1].strip().replace(
"url=",
"")
130 return stream.geturl()
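    # Worked example of the parsing above (URL assumed): for a meta tag with
    # content "0; url=http://example.org/dl/file.zip", split(";")[1] yields
    # " url=http://example.org/dl/file.zip"; strip() and replace("url=", "")
    # leave "http://example.org/dl/file.zip" as the next hop.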
133 """ Report the latest downloaded file chunk 135 self.report.progressX[self.
ofilename] = {
"bytes_so_far": bytes_so_far,
"total_size": total_size}
    def chunk_read(self, response, ofilename, chunk_size=8192, report_hook=None):
        total_size = int(response.headers["Content-Length"])
        bytes_so_far = 0
        with open(ofilename, 'wb') as ofd:
            while True:  # read loop assumed; intervening lines elided in the original
                chunk = response.read(chunk_size)
                if not chunk:
                    break
                ofd.write(chunk)
                bytes_so_far += len(chunk)
                if report_hook:
                    report_hook(bytes_so_far, chunk_size, total_size)
        return bytes_so_far
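    # Minimal usage sketch (not in the original): chunk_read calls its
    # report_hook with (bytes_so_far, chunk_size, total_size), the exact
    # signature of chunk_report, so the two wire together directly:
    #
    #   response = urllib.request.urlopen(req)
    #   o.chunk_read(response, o.ofilename, report_hook=o.chunk_report)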
    def download(self):  # method name assumed; its definition line is elided
        # Browser-like request headers so servers treat this as a normal client.
        hdr = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'none',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive'}
        self.ofilename = self.download_path.split('/')[-1]
        req = urllib.request.Request(self.download_path, headers=hdr)
        response = urllib.request.urlopen(req)
        total_size = int(response.headers["Content-Length"])  # assumed; original lines elided
        bytes_so_far = 0                                      # assumed; original lines elided
        # Test mode: simulate progress with a random per-tick increment (see
        # the 'randomized test download file sizes' TODO). Lower bound raised
        # from 0 to 1 so a zero increment cannot loop forever.
        inc = randint(1, 1024)
        while bytes_so_far < total_size:
            bytes_so_far += inc  # increment assumed; the original loop body is elided
            if bytes_so_far > total_size:
                bytes_so_far = total_size
            self.chunk_report(bytes_so_far, inc, total_size)  # reporting call assumed
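# Hedged sketch (not in the original): servers do not always send a
# Content-Length header, and int(None) raises a TypeError in chunk_read.
# A hypothetical helper like this makes the size lookup safe:
def _content_length(response, default=0):
    """Return the Content-Length header as an int, or default if absent."""
    value = response.headers.get("Content-Length")
    return int(value) if value is not None else default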
class DownloadThread(Thread):  # class name assumed; its definition line is elided
    def __init__(self, urlq, tq, dl_site_info, report):
        Thread.__init__(self)
        # attribute assignments assumed; the original lines are elided
        self.urlq = urlq
        self.tq = tq
        self.dl_site_info = dl_site_info
        self.report = report

    def run(self):  # method name assumed; its definition line is elided
        try:
            url = self.urlq.get()
            # The original tested `self.urlq == None`, which can never be true
            # after get(); a None *item* is the conventional stop sentinel.
            if url is None:
                return
            for key, value in self.dl_site_info.items():
                # site-to-url matching test elided in the original
                download_path = value["download_path"]
                id_pos = value["id_pos"]
                if download_path is None:
                    print("ERROR: base url of website not configured")
                    continue  # guard body assumed; elided in the original
                o = ScrapeFile(url, download_path, id_pos, self.report)
                self.report.progressX[o.ofilename] = {
                    "bytes_so_far": 0,
                    "total_size": 0}
                o.download()  # invocation assumed; the original lines are elided
            self.urlq.task_done()
        except HTTPError as err:
            print("ERROR: %s | HTTPError: %s" % (url, err))
        except MyError as err:
            print("\n" + "** signal caught: %s **\n" % err)
class Report:  # class name assumed; its definition line is elided in the original
    progressX = {}  # shared progress table; initialization assumed, elided

    def print_progress(self):  # name taken from the calls in __main__
        sys.stdout.write('\033[2J\033[H')  # ANSI: clear screen, cursor home
        for filename, item in self.progressX.items():
            if item["total_size"] > 0:
                percent = float(item["bytes_so_far"] / item["total_size"])
                bar = ('=' * int(percent * 20)).ljust(20)
                percent = int(percent * 100)
                # bytes -> megabytes, hence MB (the original printed "Mb")
                sys.stdout.write(
                    "%s [%s] %s%% @ %sMB of %sMB\n" %
                    (filename, bar, percent,
                     int(item["bytes_so_far"] / 1024 / 1024),
                     int(item["total_size"] / 1024 / 1024)))
if __name__ == '__main__':
    """
    1. create params object and populate configuration
    2. loop through input file
    3. download file from scrape
    """
    # Per-site scrape rules: the download link's path prefix plus the index
    # of the URL path segment that identifies the file.
    dl_site_info = {  # dict name taken from its use in the worker thread
        "site-A": {
            "download_path": "/download/",
            "id_pos": -2},
        "site-B": {
            "download_path": "/downloadfile/",
            "id_pos": -1},
        "sourceforge": {
            "download_path": "/projects/",
            "id_pos": -1}
    }
    input_file = "dl.input"
    inf_arr = []    # assumed initialization; original lines elided
    urlq = Queue()  # URL work queue; initialization assumed, elided
    try:
        with open(input_file) as ifd:
            for line in ifd:  # loop header assumed; the original line is elided
                # skip comment and blank lines
                if ("#" in line) or (line == '\n'):
                    continue  # assumed; the guard's original body is elided
                inf_arr.append(line.rstrip('\n'))
                urlq.put(line.rstrip('\n'))
    except EnvironmentError as err:
        print("Error: problem with input file | %s" % err)
    report = Report()  # assumed initialization; original lines elided
    tq = Queue()       # thread-tracking queue; initialization assumed, elided
    max_threads = 4    # assumed value; the original line is elided

    print("creating threads | urlq.qsize(): %s" % (urlq.qsize()))
    while not urlq.empty():
        for i in range(max_threads):
            if not urlq.empty() and (tq.qsize() < max_threads):
                # thread construction/start lines elided in the original
                print("creating thread: %d | tq.qsize(): %d" % (i, tq.qsize()))
        while not tq.empty():
            report.print_progress()
            print("URLs waiting in queue: %s | Number files downloading: %s"
                  % (urlq.qsize(), tq.qsize()))
            # ... lines elided in the original ...
            if tq.qsize() < max_threads:
                break  # assumed: return to the outer loop to spawn more threads
    report.print_progress()
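    # Hedged sketch of the elided thread-creation step above, reusing the
    # DownloadThread name assumed earlier; daemon mode and tq bookkeeping
    # are assumptions, not the original code:
    #
    #   t = DownloadThread(urlq, tq, dl_site_info, report)
    #   t.daemon = True
    #   t.start()
    #   tq.put(t)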