From ee26a748c6cacfcaf3f9caa872537f1d5a0b466a Mon Sep 17 00:00:00 2001 From: LukasHedegaard Date: Fri, 22 May 2020 09:09:16 +0200 Subject: [PATCH 1/6] Update environment.yml --- Crawler/Kinetics/environment.yml | 58 ++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/Crawler/Kinetics/environment.yml b/Crawler/Kinetics/environment.yml index cc6eadb..0fc1595 100755 --- a/Crawler/Kinetics/environment.yml +++ b/Crawler/Kinetics/environment.yml @@ -1,28 +1,36 @@ name: kinetics -channels: !!python/tuple -- !!python/unicode - 'defaults' +channels: + - anaconda + - menpo + - defaults dependencies: -- joblib=0.9.4=py27_0 -- menpo::ffmpeg=3.1.3=0 -- mkl=2017.0.1=0 -- numpy=1.12.1=py27_0 -- openssl=1.0.2k=1 -- pandas=0.19.2=np112py27_1 -- pip=9.0.1=py27_1 -- python=2.7.13=0 -- python-dateutil=2.6.0=py27_0 -- pytz=2017.2=py27_0 -- readline=6.2=2 -- setuptools=27.2.0=py27_0 -- six=1.10.0=py27_0 -- sqlite=3.13.0=0 -- tk=8.5.18=0 -- wheel=0.29.0=py27_0 -- zlib=1.2.8=3 -- pip: - - decorator==4.0.11 - - olefile==0.44 - - youtube-dl==2017.6.5 -prefix: /home/cabaf/.conda/envs/kinetics + - ca-certificates=2020.1.1 + - certifi=2020.4.5.1 + - ffmpeg=2.8.6 + - libcxx=10.0.0 + - libedit=3.1.20181209 + - libffi=3.3 + - ncurses=6.2 + - openssl=1.1.1g + - pip=20.0.2 + - python=3.7.7 + - readline=8.0 + - setuptools=46.4.0 + - sqlite=3.31.1 + - tk=8.6.8 + - wheel=0.34.2 + - xz=5.2.5 + - zlib=1.2.11 + - pip: + - decorator==4.4.2 + - intel-openmp==2019.0 + - joblib==0.15.1 + - mkl==2019.0 + - numpy==1.18.4 + - olefile==0.46 + - pandas==1.0.3 + - python-dateutil==2.8.1 + - pytz==2020.1 + - six==1.14.0 + - youtube-dl==2020.5.8 From 27a53b52b654e3f6490a0530986fabb37c622cf9 Mon Sep 17 00:00:00 2001 From: Lukas Hedegaard Date: Mon, 25 May 2020 08:40:10 +0200 Subject: [PATCH 2/6] Update environment.yml with condo-forge channel --- Crawler/Kinetics/environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/Crawler/Kinetics/environment.yml 
b/Crawler/Kinetics/environment.yml index 0fc1595..3285a56 100755 --- a/Crawler/Kinetics/environment.yml +++ b/Crawler/Kinetics/environment.yml @@ -2,6 +2,7 @@ name: kinetics channels: - anaconda - menpo + - conda-forge - defaults dependencies: - ca-certificates=2020.1.1 From 3da2642afd9498f00ebc9e8366a67d7581769929 Mon Sep 17 00:00:00 2001 From: LukasHedegaard Date: Tue, 26 May 2020 07:59:42 +0000 Subject: [PATCH 3/6] Update kinetics crawler to omit existing videos --- Crawler/Kinetics/download.py | 65 +++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/Crawler/Kinetics/download.py b/Crawler/Kinetics/download.py index c8b2cdc..de013e5 100755 --- a/Crawler/Kinetics/download.py +++ b/Crawler/Kinetics/download.py @@ -77,40 +77,43 @@ def download_clip(video_identifier, output_filename, # Construct command line for getting the direct video link. tmp_filename = os.path.join(tmp_dir, '%s.%%(ext)s' % uuid.uuid4()) - command = ['youtube-dl', - '--quiet', '--no-warnings', - '-f', 'mp4', - '-o', '"%s"' % tmp_filename, - '"%s"' % (url_base + video_identifier)] - command = ' '.join(command) - attempts = 0 - while True: + + if not os.path.exists(output_filename): + if not os.path.exists(tmp_filename): + command = ['youtube-dl', + '--quiet', '--no-warnings', + '-f', 'mp4', + '-o', '"%s"' % tmp_filename, + '"%s"' % (url_base + video_identifier)] + command = ' '.join(command) + attempts = 0 + while True: + try: + output = subprocess.check_output(command, shell=True, + stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as err: + attempts += 1 + if attempts == num_attempts: + return status, err.output + else: + break + + tmp_filename = glob.glob('%s*' % tmp_filename.split('.')[0])[0] + # Construct command to trim the videos (ffmpeg required). 
+ command = ['ffmpeg', + '-i', '"%s"' % tmp_filename, + '-ss', str(start_time), + '-t', str(end_time - start_time), + '-c:v', 'libx264', '-c:a', 'copy', + '-threads', '1', + '-loglevel', 'panic', + '"%s"' % output_filename] + command = ' '.join(command) try: output = subprocess.check_output(command, shell=True, - stderr=subprocess.STDOUT) + stderr=subprocess.STDOUT) except subprocess.CalledProcessError as err: - attempts += 1 - if attempts == num_attempts: - return status, err.output - else: - break - - tmp_filename = glob.glob('%s*' % tmp_filename.split('.')[0])[0] - # Construct command to trim the videos (ffmpeg required). - command = ['ffmpeg', - '-i', '"%s"' % tmp_filename, - '-ss', str(start_time), - '-t', str(end_time - start_time), - '-c:v', 'libx264', '-c:a', 'copy', - '-threads', '1', - '-loglevel', 'panic', - '"%s"' % output_filename] - command = ' '.join(command) - try: - output = subprocess.check_output(command, shell=True, - stderr=subprocess.STDOUT) - except subprocess.CalledProcessError as err: - return status, err.output + return status, err.output # Check if the video was successfully saved. 
status = os.path.exists(output_filename) From 4e3406c1b59d5bb339bd9fb60dc6942a76f1d38d Mon Sep 17 00:00:00 2001 From: LukasHedegaard Date: Fri, 25 Sep 2020 13:19:36 +0000 Subject: [PATCH 4/6] Add redownload mode --- Crawler/Kinetics/download.py | 120 ++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 45 deletions(-) diff --git a/Crawler/Kinetics/download.py b/Crawler/Kinetics/download.py index de013e5..7444841 100755 --- a/Crawler/Kinetics/download.py +++ b/Crawler/Kinetics/download.py @@ -11,6 +11,13 @@ from joblib import Parallel import pandas as pd +import random +def generate_key(): + STR_KEY_GEN = 'ABCDEFGHIJKLMNOPQRSTUVWXYzabcdefghijklmnopqrstuvwxyz' + return ''.join(random.choice(STR_KEY_GEN) for _ in range(20)) + +erf = "errorlog" + generate_key() + def create_video_folders(dataset, output_dir, tmp_dir): """Creates a directory for each label name in the dataset.""" @@ -52,7 +59,7 @@ def construct_video_filename(row, label_to_dir, trim_format='%06d'): def download_clip(video_identifier, output_filename, start_time, end_time, tmp_dir='/tmp/kinetics', - num_attempts=5, + num_attempts=2, url_base='https://www.youtube.com/watch?v='): """Download a video from youtube if exists and is not blocked. @@ -76,48 +83,30 @@ def download_clip(video_identifier, output_filename, status = False # Construct command line for getting the direct video link. 
tmp_filename = os.path.join(tmp_dir, - '%s.%%(ext)s' % uuid.uuid4()) - + f'{video_identifier}.mp4') + if not os.path.exists(output_filename): - if not os.path.exists(tmp_filename): - command = ['youtube-dl', - '--quiet', '--no-warnings', - '-f', 'mp4', - '-o', '"%s"' % tmp_filename, - '"%s"' % (url_base + video_identifier)] - command = ' '.join(command) - attempts = 0 - while True: - try: - output = subprocess.check_output(command, shell=True, - stderr=subprocess.STDOUT) - except subprocess.CalledProcessError as err: - attempts += 1 - if attempts == num_attempts: - return status, err.output - else: - break - - tmp_filename = glob.glob('%s*' % tmp_filename.split('.')[0])[0] - # Construct command to trim the videos (ffmpeg required). - command = ['ffmpeg', - '-i', '"%s"' % tmp_filename, - '-ss', str(start_time), - '-t', str(end_time - start_time), - '-c:v', 'libx264', '-c:a', 'copy', - '-threads', '1', - '-loglevel', 'panic', - '"%s"' % output_filename] - command = ' '.join(command) - try: - output = subprocess.check_output(command, shell=True, - stderr=subprocess.STDOUT) - except subprocess.CalledProcessError as err: - return status, err.output + + command = f'youtube-dl -f mp4 --quiet --no-warnings -o {tmp_filename} "{url_base + video_identifier}" && ffmpeg -i {tmp_filename} -ss {str(start_time)} -t {str(end_time - start_time)} -threads 1 -c:v libx264 -c:a copy -loglevel error "{output_filename}" -f mp4 -y' + + attempts = 0 + while True: + try: + output = subprocess.check_output(command, shell=True, + stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as err: + attempts += 1 + if attempts == num_attempts: + return status, f"{str(err.output)[:500]}" + else: + break # Check if the video was successfully saved. 
status = os.path.exists(output_filename) - os.remove(tmp_filename) + try: + os.remove(tmp_filename) + except: + pass return status, 'Downloaded' @@ -134,9 +123,46 @@ def download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir): row['start-time'], row['end-time'], tmp_dir=tmp_dir) status = tuple([clip_id, downloaded, log]) + print(status) return status + +def redownload_clip_wrapper(row, label_to_dir, trim_format, tmp_dir): + """Wrapper for parallel processing purposes.""" + output_filename = construct_video_filename(row, label_to_dir, + trim_format) + clip_id = os.path.basename(output_filename).split('.mp4')[0] + if os.path.exists(output_filename): + + check_for_errors_in_file = f'ffmpeg -v error -i "{output_filename}" -f null - 2>{erf} && cat {erf}' + try: + output = subprocess.check_output(check_for_errors_in_file, shell=True, stderr=subprocess.STDOUT) + if not output: + status = tuple([clip_id, True, 'Exists']) + print(status) + return status + except subprocess.CalledProcessError as err: + print(err) + + print(f"Removing corrupted file: {output_filename}") + try: + os.remove(output_filename) + except: + pass + downloaded, log = download_clip(row['video-id'], output_filename, + row['start-time'], row['end-time'], + tmp_dir=tmp_dir) + status = tuple([clip_id, downloaded, log]) + print(status) + return status + else: + # Was never able to download clip + status = tuple([clip_id, True, "Could not download"]) + print(status) + return status + + def parse_kinetics_annotations(input_csv, ignore_is_cc=False): """Returns a parsed DataFrame. @@ -167,7 +193,7 @@ def parse_kinetics_annotations(input_csv, ignore_is_cc=False): def main(input_csv, output_dir, trim_format='%06d', num_jobs=24, tmp_dir='/tmp/kinetics', - drop_duplicates=False): + drop_duplicates=False, download_mode="download"): # Reading and parsing Kinetics. 
dataset = parse_kinetics_annotations(input_csv) @@ -184,14 +210,18 @@ def main(input_csv, output_dir, # Creates folders where videos will be saved later. label_to_dir = create_video_folders(dataset, output_dir, tmp_dir) + run = { + "download": download_clip_wrapper, + "redownload": redownload_clip_wrapper + }[download_mode] + # Download all clips. if num_jobs == 1: status_lst = [] for i, row in dataset.iterrows(): - status_lst.append(download_clip_wrapper(row, label_to_dir, - trim_format, tmp_dir)) + status_lst.append(run(row, label_to_dir, trim_format, tmp_dir)) else: - status_lst = Parallel(n_jobs=num_jobs)(delayed(download_clip_wrapper)( + status_lst = Parallel(n_jobs=num_jobs)(delayed(run)( row, label_to_dir, trim_format, tmp_dir) for i, row in dataset.iterrows()) @@ -215,9 +245,9 @@ def main(input_csv, output_dir, help=('This will be the format for the ' 'filename of trimmed videos: ' 'videoid_%0xd(start_time)_%0xd(end_time).mp4')) - p.add_argument('-n', '--num-jobs', type=int, default=24) + p.add_argument('-n', '--num-jobs', type=int, default=1) p.add_argument('-t', '--tmp-dir', type=str, default='/tmp/kinetics') p.add_argument('--drop-duplicates', type=str, default='non-existent', help='Unavailable at the moment') - # help='CSV file of the previous version of Kinetics.') + p.add_argument('-m', '--download_mode', type=str, default='download', choices=["download", "redownload"]) main(**vars(p.parse_args())) From d33fb631866bd1551e029a7d8e559e4ef93bfe44 Mon Sep 17 00:00:00 2001 From: LukasHedegaard Date: Mon, 9 Nov 2020 08:22:31 +0000 Subject: [PATCH 5/6] Add source dir lookup --- Crawler/Kinetics/download.py | 42 +++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/Crawler/Kinetics/download.py b/Crawler/Kinetics/download.py index 7444841..0bd5f3e 100755 --- a/Crawler/Kinetics/download.py +++ b/Crawler/Kinetics/download.py @@ -4,8 +4,10 @@ import os import shutil import subprocess +from typing import Dict, List, 
Set import uuid from collections import OrderedDict +from pathlib import Path from joblib import delayed from joblib import Parallel @@ -110,13 +112,19 @@ def download_clip(video_identifier, output_filename, return status, 'Downloaded' -def download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir): +def download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir, existing_files:Dict[str,Path]={}): """Wrapper for parallel processing purposes.""" output_filename = construct_video_filename(row, label_to_dir, trim_format) clip_id = os.path.basename(output_filename).split('.mp4')[0] if os.path.exists(output_filename): status = tuple([clip_id, True, 'Exists']) + print(status) + return status + if clip_id in existing_files: + shutil.copyfile(src=existing_files[clip_id],dst=output_filename) + status = tuple([clip_id, True, 'Copied']) + print(status) return status downloaded, log = download_clip(row['video-id'], output_filename, @@ -126,8 +134,6 @@ def download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir): print(status) return status - - def redownload_clip_wrapper(row, label_to_dir, trim_format, tmp_dir): """Wrapper for parallel processing purposes.""" output_filename = construct_video_filename(row, label_to_dir, @@ -193,19 +199,19 @@ def parse_kinetics_annotations(input_csv, ignore_is_cc=False): def main(input_csv, output_dir, trim_format='%06d', num_jobs=24, tmp_dir='/tmp/kinetics', - drop_duplicates=False, download_mode="download"): + drop_duplicates=False, download_mode="download", source_dir=""): # Reading and parsing Kinetics. 
dataset = parse_kinetics_annotations(input_csv) - # if os.path.isfile(drop_duplicates): - # print('Attempt to remove duplicates') - # old_dataset = parse_kinetics_annotations(drop_duplicates, - # ignore_is_cc=True) - # df = pd.concat([dataset, old_dataset], axis=0, ignore_index=True) - # df.drop_duplicates(inplace=True, keep=False) - # print(dataset.shape, old_dataset.shape) - # dataset = df - # print(dataset.shape) + + # Make catalogue of existing files + source_dir = Path(source_dir) + if source_dir.is_dir(): + print("Looking through source dir") + existing_files = {str(p.stem): p for p in source_dir.rglob("*.mp4")} if source_dir.is_dir() else {} + print("Done looking") + else: + existing_files = {} # Creates folders where videos will be saved later. label_to_dir = create_video_folders(dataset, output_dir, tmp_dir) @@ -219,11 +225,12 @@ def main(input_csv, output_dir, if num_jobs == 1: status_lst = [] for i, row in dataset.iterrows(): - status_lst.append(run(row, label_to_dir, trim_format, tmp_dir)) + status_lst.append(run(row, label_to_dir, trim_format, tmp_dir, existing_files)) else: - status_lst = Parallel(n_jobs=num_jobs)(delayed(run)( - row, label_to_dir, - trim_format, tmp_dir) for i, row in dataset.iterrows()) + status_lst = Parallel(n_jobs=num_jobs)( + delayed(run)(row, label_to_dir, trim_format, tmp_dir) + for i, row in dataset.iterrows() + ) # Clean tmp dir. 
shutil.rmtree(tmp_dir) @@ -250,4 +257,5 @@ def main(input_csv, output_dir, p.add_argument('--drop-duplicates', type=str, default='non-existent', help='Unavailable at the moment') p.add_argument('-m', '--download_mode', type=str, default='download', choices=["download", "redownload"]) + p.add_argument('-s', '--source-dir', type=str, default='', help="Directory in which to look for files before download.") main(**vars(p.parse_args())) From 408c89d4fea6b74baa06c95ecb95db4908afdb84 Mon Sep 17 00:00:00 2001 From: LukasHedegaard Date: Mon, 11 Jan 2021 09:28:13 +0000 Subject: [PATCH 6/6] Add reverse download direction --- Crawler/Kinetics/download.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Crawler/Kinetics/download.py b/Crawler/Kinetics/download.py index 0bd5f3e..bb840f0 100755 --- a/Crawler/Kinetics/download.py +++ b/Crawler/Kinetics/download.py @@ -12,6 +12,7 @@ from joblib import delayed from joblib import Parallel import pandas as pd +from functools import partial import random def generate_key(): @@ -199,10 +200,12 @@ def parse_kinetics_annotations(input_csv, ignore_is_cc=False): def main(input_csv, output_dir, trim_format='%06d', num_jobs=24, tmp_dir='/tmp/kinetics', - drop_duplicates=False, download_mode="download", source_dir=""): + drop_duplicates=False, download_mode="download", source_dir="", reverse=False): # Reading and parsing Kinetics. 
dataset = parse_kinetics_annotations(input_csv) + if reverse: + dataset = dataset.reindex(index=dataset.index[::-1]) # Make catalogue of existing files source_dir = Path(source_dir) @@ -217,7 +220,7 @@ def main(input_csv, output_dir, label_to_dir = create_video_folders(dataset, output_dir, tmp_dir) run = { - "download": download_clip_wrapper, + "download": partial(download_clip_wrapper, existing_files=existing_files), "redownload": redownload_clip_wrapper }[download_mode] @@ -225,7 +228,7 @@ def main(input_csv, output_dir, if num_jobs == 1: status_lst = [] for i, row in dataset.iterrows(): - status_lst.append(run(row, label_to_dir, trim_format, tmp_dir, existing_files)) + status_lst.append(run(row, label_to_dir, trim_format, tmp_dir)) else: status_lst = Parallel(n_jobs=num_jobs)( delayed(run)(row, label_to_dir, trim_format, tmp_dir) @@ -258,4 +261,5 @@ def main(input_csv, output_dir, help='Unavailable at the moment') p.add_argument('-m', '--download_mode', type=str, default='download', choices=["download", "redownload"]) p.add_argument('-s', '--source-dir', type=str, default='', help="Directory in which to look for files before download.") + p.add_argument('-r', '--reverse', action="store_true", help="Reverse direction of download.") main(**vars(p.parse_args()))