"""Download videos from snowboarding.transworld.net"""
import os
import sys
import logging
import argparse
import requests
import lxml.html
import slugify
import pytube
import vimeo_dl
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# Listing page whose article links point at the individual video posts.
ROOT = "https://www.snowboarder.com/videos"
# XPath selectors used when scraping the listing page and each article page.
PAGE_XPATH = '//article/figure/a'
VIDEO_XPATH = '//iframe'
TITLE_XPATH = '//h1[@class="post-title"]'
# Videos land in DOWNLOAD_DIR; DOWNLOADED is a newline-separated record of
# filenames that have already been fetched.
DOWNLOAD_DIR = os.path.expanduser("~/videot/snowboarder.com/")
DOWNLOADED = os.path.expanduser("~/videot/snowboarder.com/downloaded")
def get_pages():
    """Return the list of article URLs linked from the /videos page."""
    response = requests.get(ROOT)
    tree = lxml.html.fromstring(response.content)
    # Each matched <a> wraps an article thumbnail; its href is the article URL.
    return [article.get('href') for article in tree.xpath(PAGE_XPATH)]
def get_title(site_url):
    """Return the stripped text of the article's post title.

    Logs a warning when the page does not contain exactly one title
    element, and returns ``None`` when none is found at all.
    """
    response = requests.get(site_url)
    tree = lxml.html.fromstring(response.content)
    titles = tree.xpath(TITLE_XPATH)
    if not titles:
        # Previously this case fell through to titles[0] and raised
        # IndexError (after logging a misleading "Multiple titles" warning).
        logging.warning('No title found from %s', site_url)
        return None
    if len(titles) > 1:
        logging.warning('Multiple titles found from %s', site_url)
    return titles[0].text.strip()
def get_stream_url(site_url):
    """Return the src of the first embedded <iframe>, or ``None`` if absent."""
    page = requests.get(site_url)
    document = lxml.html.fromstring(page.content)
    embeds = document.xpath(VIDEO_XPATH)
    if not embeds:
        logging.warning('Video not found from %s', site_url)
        return None
    return embeds[0].get('src')
def download_stream(name, url):
    """Download the video at *url* into DOWNLOAD_DIR as *name*.

    Supports YouTube and Vimeo embed URLs.

    Raises:
        ValueError: if the URL is from an unrecognized video host.
    """
    logging.info('Downloading %s to %s', url, name)
    if "youtu" in url:
        pytube.YouTube(url).streams.first().download(
            output_path=DOWNLOAD_DIR,
            filename=name
        )
    elif "vimeo" in url:
        # "video/" must be removed to avoid HTTPError
        url = url.replace('.com/video/', '.com/')
        stream = vimeo_dl.new(url).streams[0]
        filepath = os.path.join(DOWNLOAD_DIR, '{}.{}'.format(name, stream.extension))
        stream.download(filepath=filepath)
    else:
        # ValueError replaces the bare Exception; callers that catch
        # Exception (download_all) still handle it.
        raise ValueError('Unknown video source: %s' % url)
def mark_downloaded(title):
    """Append *title* to the record of downloaded files."""
    with open(DOWNLOADED, 'a') as record:
        record.write(title + "\n")
def read_downloaded():
    """Return the recorded list of downloaded file names (empty if no record)."""
    if not os.path.isfile(DOWNLOADED):
        return []
    with open(DOWNLOADED, 'r') as record:
        return record.read().splitlines()
def is_downloaded(name, downloaded):
    """Return ``True`` if *name* appears in the *downloaded* collection."""
    # Membership test already yields a bool; no if/return-True needed.
    return name in downloaded
def skip_page(name, url):
    """Record the page *url* in a file called *name* under DOWNLOAD_DIR."""
    marker_path = os.path.join(DOWNLOAD_DIR, name)
    with open(marker_path, 'w+') as marker:
        marker.write(url + "\n")
def download_all():
    """Fetch every video linked from the /videos page that is not yet downloaded."""
    article_urls = get_pages()
    already_done = read_downloaded()
    for article_url in article_urls:
        title = get_title(article_url)
        logging.info(title)
        name = slugify.slugify(title)
        if is_downloaded(name, already_done):
            logging.info('%s is already downloaded', name)
            continue
        stream_url = get_stream_url(article_url)
        if not stream_url:
            # No embedded player found; remember the page for manual review.
            skip_page(name, article_url)
            continue
        try:
            download_stream(name, stream_url)
            mark_downloaded(name)
        except Exception as error:
            # Best-effort batch: log the failure and keep the page url around.
            logging.exception(error)
            skip_page(name, article_url)
def remove_titles(titles):
    """Mark title(s) downloaded and remove their url file(s)."""
    for title in titles:
        mark_downloaded(title)
        # skip_page() writes the url file under DOWNLOAD_DIR, so remove it
        # from there instead of the current working directory. os.path.join
        # discards DOWNLOAD_DIR when *title* is already an absolute path, so
        # absolute-path callers keep working.
        os.remove(os.path.join(DOWNLOAD_DIR, title))
def main():
    """Parse command-line options and dispatch to remove or download mode."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--remove', nargs='+', required=False)
    options = parser.parse_args()
    if options.remove:
        remove_titles(options.remove)
        return
    download_all()
if __name__ == "__main__":
main()