PyDL Version: 0.0.1
pydl.py
#!/usr/bin/python3

# TODO: audit imports
import urllib
import urllib.request
from urllib.parse import urlparse
from urllib.error import URLError, HTTPError
from lxml.html import parse
from contextlib import closing
import shutil, sys, re
import signal
import time

from threading import Thread
from queue import Queue
import collections

DEBUG = False
DEBUG_CLEAR = False

if DEBUG:
    from random import randint

"""
# TODO:

## FIX:
* 'URLs waiting...' displays a negative queue size


## Project
* update gitlab
    * project description
    * roadmap
    * tickets
* update changelog
* merge branches
    * fix file names
    * do cleanup
* evaluate project against [phoemur/wgetter: Download utility written in python](https://github.com/phoemur/wgetter)
* move these notes into gitlab

## Features
* add command line options
    * download priority: smallest files first
    * num threads per site
    * max total threads
    * debug toggles
    * debug messages
    * randomized test download file sizes
    * log toggle
    * output directory
    * input list
    * input file name
* add debug messages
* add log output
* thread timeouts
* signal handler
    * SIGKILL
    * SIGHUP
    * thread cleanup
* output order (downloaded first)
* file cleanup
* color output
* handle other sites
    * use url description dictionary to add callbacks for links
"""

# TODO: finish signal handler (kill, hup)
# * needs to be more robust for threads
# * needs to clean up partially downloaded files
class MyError(Exception):
    pass

def myHandler(sig, frame):
    raise MyError('Received signal ' + str(sig) +
                  ' on line ' + str(frame.f_lineno) +
                  ' in ' + frame.f_code.co_filename)

signal.signal(signal.SIGINT, myHandler)
signal.signal(signal.SIGHUP, myHandler)
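
# Note: in CPython, signal handlers run in the main thread only, so the
# MyError raised here interrupts whatever the main thread is doing at the
# time (e.g. the monitoring loop below); worker threads are daemons and
# die with the process.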

class Config:
    """ Configuration parameters
    1. populate vars
    2. open input file -save descriptor
    3. verify output directory -cwd(?)
    """

    input_file_name = "infile"
    ifd = None  # input file descriptor
    output_dir_path = "./"
    max_threads = 1

    def __init__(self):
        pass

class ScrapeFile:
    """ Class to fetch a file
    """

    def __init__(self, input_url, download_path, file_id, report):
        # TODO: handle other site layouts (like sourceforge)
        purl = urllib.parse.urlparse(input_url)
        self.filename = purl.path.split('/')[-1]

        # build the download url
        download_path_start = purl.scheme + "://" + purl.netloc + download_path + purl.path.split('/')[file_id]

        # TODO: add try block in caller
        self.download_path = purl.scheme + "://" + purl.netloc + self.follow(download_path_start)

        # set the output filename
        self.ofilename = self.download_path.split('/')[-1]

        # report class instance reference
        self.report = report

    def follow(self, url):
        """ Follow the url through redirects
        """
        while True:
            with closing(urllib.request.urlopen(url)) as stream:
                next_url = parse(stream).xpath("//meta[@http-equiv = 'refresh']/@content")
                if next_url:
                    url = next_url[0].split(";")[1].strip().replace("url=", "")
                    # temp hack: return here because of the known hop level
                    return url
                else:
                    return stream.geturl()

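    # Illustrative example of the meta-refresh markup follow() matches
    # (hypothetical page source):
    #   <meta http-equiv="refresh" content="0; url=/download/13522/pkg.tar.gz">
    # The content attribute is split on ';', stripped, and the 'url=' prefix
    # removed, leaving the redirect target path.
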
    def chunk_report(self, bytes_so_far, chunk_size, total_size):
        """ Report the latest downloaded file chunk
        """
        self.report.progressX[self.ofilename] = {"bytes_so_far": bytes_so_far, "total_size": total_size}


    def chunk_read(self, response, ofilename, chunk_size=8192, report_hook=None):
        total_size = int(response.headers["Content-Length"])
        bytes_so_far = 0

        with open(ofilename, 'wb') as ofd:
            while True:
                chunk = response.read(chunk_size)

                if not chunk:
                    break

                bytes_so_far += len(chunk)

                if report_hook:
                    report_hook(bytes_so_far, chunk_size, total_size)

                ofd.write(chunk)

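    # Note: reading in fixed-size chunks keeps memory use bounded regardless
    # of file size; this assumes the server sends a Content-Length header,
    # since total_size is taken from it before the first read.
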
    def download(self):

        # just in case we need to look like a browser
        hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
               'Accept-Encoding': 'none',
               'Accept-Language': 'en-US,en;q=0.8',
               'Connection': 'keep-alive'}

        self.ofilename = self.download_path.split('/')[-1]
        req = urllib.request.Request(self.download_path, headers=hdr)
        response = urllib.request.urlopen(req)

        self.chunk_read(response, self.ofilename, report_hook=self.chunk_report)

    def test_thread(self):
        chunk_size = 8192
        # avoid randint(0, ...): a zero increment would loop forever
        inc = randint(1, 1024)

        total_size = 10240
        bytes_so_far = 0

        while bytes_so_far < total_size:
            time.sleep(.5)
            bytes_so_far += inc

            # compensate for random number overage
            if bytes_so_far > total_size:
                bytes_so_far = total_size

            self.chunk_report(bytes_so_far, chunk_size, total_size)


class ThreadProc(Thread):

    def __init__(self, urlq, tq, dl_site_info, report):
        Thread.__init__(self)
        self.urlq = urlq
        self.tq = tq
        self.dl_site_info = dl_site_info
        self.report = report

    def run(self):
        try:
            url = self.urlq.get()
            while True:
                if self.urlq is None:
                    break

                # setup download_path link portion
                download_path = None
                id_pos = None

                # grab the base download url based on the patterns provided
                for key, value in self.dl_site_info.items():
                    if key in url:
                        download_path = value["download_path"]
                        # grab the file id position from the template
                        id_pos = value["id_pos"]

                # TODO: needs error handling -exit bad
                if download_path is None:
                    print("ERROR: base url of website not configured")
                    # TODO: should probably just continue here
                    sys.exit(1)

                # create object and do work
                try:
                    o = ScrapeFile(url, download_path, id_pos, self.report)
                    self.report.progressX[o.ofilename] = {"bytes_so_far": 0, "total_size": 0}
                    if DEBUG:
                        o.test_thread()
                    else:
                        o.download()
                    self.tq.get()
                    self.urlq.task_done()
                    break
                except urllib.error.HTTPError as err:
                    print("ERROR: %s | HTTPError: %s" % (url, err))
                    # give up on this url instead of retrying it forever;
                    # release the thread slot and mark the work item done
                    self.tq.get()
                    self.urlq.task_done()
                    break

        except MyError as err:
            print("\n" + "** signal caught: %s **\n" % err)
            sys.exit(1)

        #self.tq.task_done()


class Report:
    def __init__(self):
        self.progressX = collections.OrderedDict()

    def print_progress(self):
        if not DEBUG_CLEAR:
            sys.stdout.write('\033[2J\033[H')  # clear screen
            sys.stdout.flush()

        for filename, item in self.progressX.items():
            # entries start at total_size 0 and are only filled in once the
            # first chunk arrives, so skip them to avoid dividing by zero
            if item["total_size"] > 0:
                percent = float(item["bytes_so_far"] / item["total_size"])
                bar = ('=' * int(percent * 20)).ljust(20)
                percent = int(percent * 100)
                sys.stdout.write("%s [%s] %s%% @ %sMB of %sMB\n" %
                                 (filename, bar, percent, int(item["bytes_so_far"] / 1024 / 1024), int(item["total_size"] / 1024 / 1024)))
                sys.stdout.flush()
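
# Example of a rendered progress line (illustrative values):
#   pkg.tar.gz [==========          ] 50% @ 5MB of 10MB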

#################### Main
if __name__ == '__main__':
    """
    1. create params object and populate configuration
    1a. use threads...
    2. loop through input file
    3. download file from scrape
    """

    # TODO: add argv handling
    # * add output directory

    # configurables
    # TODO: this is kind of dumb; probably need another way to do this for url patterns
    dl_site_info = {
        # uses format: http://site-A.com/download/13522
        "site-A": {"download_path": "/download/", "id_pos": -2},
        # uses format: http://site-B.com/downloadfile/5710
        "site-B": {"download_path": "/downloadfile/", "id_pos": -1},
        # uses format: https://sourceforge.net/projects/freemind/files/latest/download?source=directory
        # TODO: needs its own scraper
        "sourceforge": {"download_path": "/projects/", "id_pos": -1}
    }
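    # Reading the table above: for any input url containing a key, ScrapeFile
    # builds its starting download url as scheme + netloc + download_path +
    # the path component selected by id_pos (an index into path.split('/')),
    # then follows any meta-refresh redirects from there.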
    input_file = "dl.input"
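    # A hypothetical dl.input, one url per line; the loader below skips blank
    # lines and any line containing '#':
    #   # mirrors
    #   http://site-A.com/download/13522
    #   http://site-B.com/downloadfile/5710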

    # read input data into array
    inf_arr = []
    urlq = Queue()

    try:
        with open(input_file) as ifd:
            for line in ifd:
                # skip comment and blank lines
                if ("#" in line) or (line == '\n'):
                    continue
                inf_arr.append(line.rstrip('\n'))
                urlq.put(line.rstrip('\n'))
    except EnvironmentError as err:
        print("Error: problem with input file | %s" % err)
        sys.exit(1)

    # report class
    report = Report()

    # thread count queue
    tq = Queue()

    ########### process loop start
    max_threads = 5

    if DEBUG:
        print("creating threads | urlq.qsize(): %s" % urlq.qsize())
        time.sleep(1)

    while not urlq.empty():

        for i in range(max_threads):
            if not urlq.empty() and (tq.qsize() < max_threads):
                # start threads
                # TODO: catch exceptions and continue download if possible
                t = ThreadProc(urlq, tq, dl_site_info, report)
                tq.put(i)
                t.daemon = True
                t.start()

                if DEBUG:
                    print("creating thread: %d | tq.qsize(): %d" % (i, tq.qsize()))
                    time.sleep(1.0)

        #while any(i.is_alive() for i in workers):
        while not tq.empty():
            report.print_progress()
            print("URLs waiting in queue: %s | Number of files downloading: %s" % (urlq.qsize(), tq.qsize()))
            time.sleep(0.1)

            # add more threads if something has finished
            if not urlq.empty():
                if tq.qsize() < max_threads:
                    break

    # print last report if needed
    report.print_progress()

    #tq.join()
    ########### process loop end
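
# Usage sketch (assumes a dl.input file in the current directory):
#   $ python3 pydl.py
# Each queued url is downloaded by a daemon thread while the main thread
# redraws the per-file progress report until the queues drain.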