twsdownloader.py - twsdownloader (master)

Tree @master (Download .tar.gz)

twsdownloader.py @master — raw · history · blame

"""Download videos from www.snowboarder.com/videos"""
import argparse
import logging
import os
import sys
from urllib.parse import urljoin

import lxml.html
import pytube
import requests
import slugify
import vimeo_dl


# Send log records of level INFO and above to stdout.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# Listing page that links to the individual video articles.
ROOT = "https://www.snowboarder.com/videos"

# XPath expressions used when scraping.
# Anchors on the listing page whose href is read as the article link.
PAGE_XPATH = '//article/figure/a'
# Iframes on an article page; the first one's src is used as the video url.
VIDEO_XPATH = '//iframe'
# Heading on an article page that holds the video title.
TITLE_XPATH = '//h1[@class="post-title"]'

# Directory that receives the downloaded videos and the url files of
# skipped pages.
DOWNLOAD_DIR = os.path.expanduser("~/videot/snowboarder.com/")
# Text file listing the names that are already downloaded, one per line.
DOWNLOADED = os.path.expanduser("~/videot/snowboarder.com/downloaded")


def get_pages():
    """Get links to articles on /videos page.

    Fetches ``ROOT``, selects the anchors matching ``PAGE_XPATH`` and
    collects their ``href`` attributes.

    Returns:
        list: Article urls in document order.  Anchors without an ``href``
        are skipped and relative links are resolved against ``ROOT``.
    """
    response = requests.get(ROOT)
    tree = lxml.html.fromstring(response.content)

    urls = []
    for anchor in tree.xpath(PAGE_XPATH):
        href = anchor.get('href')
        if not href:
            # An anchor without a target would only make the later
            # ``requests.get`` call on it fail, so it is dropped here.
            continue
        # Absolute links pass through unchanged; relative ones become
        # fetchable urls.
        urls.append(urljoin(ROOT, href))

    return urls


def get_title(site_url):
    """Find the title of the video.

    Args:
        site_url: Url of the article page to fetch.

    Returns:
        str: Stripped text of the first element matching ``TITLE_XPATH``.

    Raises:
        IndexError: If the page has no element matching ``TITLE_XPATH``.
    """
    response = requests.get(site_url)
    tree = lxml.html.fromstring(response.content)
    titles = tree.xpath(TITLE_XPATH)

    if not titles:
        # Same exception type the bare ``titles[0]`` lookup raised before,
        # but with a message that names the page instead of a misleading
        # "multiple titles" warning.
        raise IndexError('No title found from %s' % site_url)
    if len(titles) > 1:
        logging.warning('Multiple titles found from %s', site_url)

    heading = titles[0]
    text = heading.text
    if text is None or not text.strip():
        # ``.text`` only holds the text before the first child element; use
        # the full text content when the title sits inside nested markup.
        text = heading.text_content()

    return text.strip()


def get_stream_url(site_url):
    """Return the ``src`` of the first iframe on the page.

    Args:
        site_url: Url of the article page to fetch.

    Returns:
        The ``src`` attribute of the first element matching
        ``VIDEO_XPATH`` (``None`` if that element has no ``src``), or
        ``None`` after logging a warning when nothing matches.
    """
    page = requests.get(site_url)
    document = lxml.html.fromstring(page.content)
    frames = document.xpath(VIDEO_XPATH)

    if frames:
        return frames[0].get('src')

    logging.warning('Video not found from %s', site_url)
    return None


def download_stream(name, url):
    """Download the video behind ``url`` into ``DOWNLOAD_DIR``.

    The url is dispatched on a substring match: urls containing ``youtu``
    are handled by ``pytube``, urls containing ``vimeo`` by ``vimeo_dl``.

    Args:
        name: Base name for the stored file; the vimeo branch appends the
            stream's extension to it.
        url: Url of the embedded video.

    Raises:
        Exception: If ``url`` matches neither source.
    """
    logging.info('Downloading %s to %s', url, name)

    if "youtu" in url:
        video = pytube.YouTube(url)
        first_stream = video.streams.first()
        first_stream.download(output_path=DOWNLOAD_DIR, filename=name)
        return

    if "vimeo" not in url:
        raise Exception('Unknown video source: %s' % url)

    # "video/" must be removed to avoid HTTPError
    plain_url = url.replace('.com/video/', '.com/')
    stream = vimeo_dl.new(plain_url).streams[0]
    target = os.path.join(
        DOWNLOAD_DIR,
        '{}.{}'.format(name, stream.extension),
    )
    stream.download(filepath=target)


def mark_downloaded(title):
    """Mark file as downloaded.

    Appends ``title`` as a new line to the ``DOWNLOADED`` file, creating
    the file and its parent directory when they do not exist yet.

    Args:
        title: Name to record.
    """
    parent = os.path.dirname(DOWNLOADED)
    if parent:
        # Opening the file for append fails when its directory is missing.
        os.makedirs(parent, exist_ok=True)

    with open(DOWNLOADED, 'a') as outfile:
        # ``write`` emits the line in one call; ``writelines`` would iterate
        # over the string character by character.
        outfile.write(title + "\n")


def read_downloaded():
    """Return the names recorded in the ``DOWNLOADED`` file.

    Returns:
        list: One entry per line of the file, without line endings; an
        empty list when ``DOWNLOADED`` is not an existing regular file.
    """
    if not os.path.isfile(DOWNLOADED):
        return []

    with open(DOWNLOADED, 'r') as handle:
        contents = handle.read()

    return contents.splitlines()


def is_downloaded(name, downloaded):
    """Tell whether ``name`` is among the ``downloaded`` names.

    Args:
        name: Name to look for.
        downloaded: Container of names that are already downloaded.

    Returns:
        bool: ``True`` when ``name`` is in ``downloaded``, else ``False``.
    """
    return name in downloaded


def skip_page(name, url):
    """Write page url to a file.

    Stores ``url`` followed by a newline in the file ``name`` inside
    ``DOWNLOAD_DIR``, replacing any previous content.  ``DOWNLOAD_DIR`` is
    created first when it does not exist.

    Args:
        name: File name to store the url under.
        url: Url of the page.
    """
    # The directory may not exist yet on a first run; without it the write
    # below would raise instead of recording the page.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    with open(os.path.join(DOWNLOAD_DIR, name), 'w') as url_file:
        url_file.write(url + "\n")


def download_all():
    """Download all new videos.

    For every article returned by ``get_pages`` the title is slugified
    into a file name.  Names already recorded as downloaded are skipped;
    otherwise the embedded stream is downloaded and the name recorded.
    Pages that yield no stream url, or whose download fails, get a url
    file via ``skip_page`` instead.  A failure on one page is logged and
    does not stop the remaining pages from being processed.
    """
    pages = get_pages()
    # A set lets names finished during this run be remembered, so a second
    # article with the same title is not downloaded again.
    downloaded = set(read_downloaded())

    for page in pages:
        try:
            title = get_title(page)
        except Exception as exception:
            # Without a title there is no file name to store a url file
            # under, so the page is only logged.
            logging.exception(exception)
            continue
        logging.info(title)

        filename = slugify.slugify(title)
        if not filename:
            # An empty name would make the helpers operate on the download
            # directory itself.
            logging.warning('No usable file name for %s', page)
            continue
        if is_downloaded(filename, downloaded):
            logging.info('%s is already downloaded', filename)
            continue

        succeeded = False
        try:
            stream_url = get_stream_url(page)
            if stream_url:
                download_stream(filename, stream_url)
                mark_downloaded(filename)
                succeeded = True
        except Exception as exception:
            logging.exception(exception)

        if succeeded:
            downloaded.add(filename)
        else:
            # Either no stream url was found or the download failed; leave
            # a url file behind for the page.
            skip_page(filename, page)


def remove_titles(titles):
    """Mark title(s) downloaded and remove url file(s).

    Each entry may be a bare name or a path to a url file.  Its base name
    is recorded as downloaded; the url file is then looked up first at the
    path as given and then inside ``DOWNLOAD_DIR``, which is where
    ``skip_page`` writes url files.  A missing url file is logged and the
    remaining titles are still processed.

    Args:
        titles: Iterable of names or paths of url files.
    """
    for title in titles:
        # Recorded names are compared with bare file names, so any
        # directory part is stripped before marking.
        name = os.path.basename(title) or title
        mark_downloaded(name)

        for candidate in (title, os.path.join(DOWNLOAD_DIR, name)):
            if os.path.isfile(candidate):
                os.remove(candidate)
                break
        else:
            logging.warning('No url file found for %s', title)


def main():
    """Parse the command line and run the requested action.

    With ``-r``/``--remove`` followed by one or more titles, those titles
    are passed to ``remove_titles``; otherwise ``download_all`` runs.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('-r', '--remove', required=False, nargs='+')
    options = cli.parse_args()

    if not options.remove:
        download_all()
        return

    remove_titles(options.remove)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()