File size: 9,878 Bytes
9c6594c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 |
import csv
import os
import urllib
import urllib.parse  # explicit: urllib.parse.quote is used below; bare "import urllib" does not guarantee the submodule is loaded
from functools import partial
from multiprocessing import Pool
from os import path
from pathlib import Path
from typing import Any, Callable, Dict, Optional, Tuple, Union

from torch import Tensor

from .folder import find_classes, make_dataset
from .utils import check_integrity, download_and_extract_archive, download_url, verify_str_arg
from .video_utils import VideoClips
from .vision import VisionDataset
def _dl_wrap(tarpath: Union[str, Path], videopath: Union[str, Path], line: str) -> None:
    # Module-level wrapper around ``download_and_extract_archive`` so it is
    # picklable by ``multiprocessing.Pool``: the download code binds the two
    # directory arguments with ``functools.partial`` and maps over URLs
    # (``line`` is one URL from the split file list).
    download_and_extract_archive(line, tarpath, videopath)
class Kinetics(VisionDataset):
    """`Generic Kinetics <https://www.deepmind.com/open-source/kinetics>`_
    dataset.

    Kinetics-400/600/700 are action recognition video datasets.

    This dataset considers every video as a collection of video clips of fixed size, specified
    by ``frames_per_clip``, where the step in frames between each clip is given by
    ``step_between_clips``.

    To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
    and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
    elements will come from video 1, and the next three elements from video 2.
    Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
    frames in a video might be present.

    Args:
        root (str or ``pathlib.Path``): Root directory of the Kinetics Dataset.
            Directory should be structured as follows:

            .. code::

                root/
                ├── split
                │   ├── class1
                │   │   ├── vid1.mp4
                │   │   ├── vid2.mp4
                │   │   ├── vid3.mp4
                │   │   ├── ...
                │   ├── class2
                │   │   ├── vidx.mp4
                │   │   ├── ...

            Note: split is appended automatically using the split argument.
        frames_per_clip (int): number of frames in a clip
        num_classes (int): select between Kinetics-400 (default), Kinetics-600, and Kinetics-700
        split (str): split of the dataset to consider; supports ``"train"`` (default) ``"val"`` ``"test"``
        frame_rate (float): If omitted, interpolate different frame rate for each clip.
        step_between_clips (int): number of frames between each clip
        transform (callable, optional): A function/transform that takes in a TxHxWxC video
            and returns a transformed version.
        download (bool): Download the official version of the dataset to root folder.
        num_workers (int): Use multiple workers for VideoClips creation
        num_download_workers (int): Use multiprocessing in order to speed up download.
        output_format (str, optional): The format of the output video tensors (before transforms).
            Can be either "THWC" or "TCHW" (default).
            Note that in most other utils and datasets, the default is actually "THWC".

    Returns:
        tuple: A 3-tuple with the following entries:

            - video (Tensor[T, C, H, W] or Tensor[T, H, W, C]): the `T` video frames in torch.uint8 tensor
            - audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
              and `L` is the number of points in torch.float tensor
            - label (int): class of the video clip

    Raises:
        RuntimeError: If ``download is True`` and the video archives are already extracted.
    """

    # Text files listing one tarball URL per line, keyed by dataset variant.
    _TAR_URLS = {
        "400": "https://s3.amazonaws.com/kinetics/400/{split}/k400_{split}_path.txt",
        "600": "https://s3.amazonaws.com/kinetics/600/{split}/k600_{split}_path.txt",
        "700": "https://s3.amazonaws.com/kinetics/700_2020/{split}/k700_2020_{split}_path.txt",
    }
    # CSV annotation files (youtube_id, time_start, time_end, label, ...) per variant.
    _ANNOTATION_URLS = {
        "400": "https://s3.amazonaws.com/kinetics/400/annotations/{split}.csv",
        "600": "https://s3.amazonaws.com/kinetics/600/annotations/{split}.csv",
        "700": "https://s3.amazonaws.com/kinetics/700_2020/annotations/{split}.csv",
    }

    def __init__(
        self,
        root: Union[str, Path],
        frames_per_clip: int,
        num_classes: str = "400",
        split: str = "train",
        frame_rate: Optional[int] = None,
        step_between_clips: int = 1,
        transform: Optional[Callable] = None,
        extensions: Tuple[str, ...] = ("avi", "mp4"),
        download: bool = False,
        num_download_workers: int = 1,
        num_workers: int = 1,
        _precomputed_metadata: Optional[Dict[str, Any]] = None,
        _video_width: int = 0,
        _video_height: int = 0,
        _video_min_dimension: int = 0,
        _audio_samples: int = 0,
        _audio_channels: int = 0,
        _legacy: bool = False,
        output_format: str = "TCHW",
    ) -> None:
        # TODO: support test
        self.num_classes = verify_str_arg(num_classes, arg="num_classes", valid_values=["400", "600", "700"])
        self.extensions = extensions
        self.num_download_workers = num_download_workers

        self.root = root
        self._legacy = _legacy

        if _legacy:
            # Legacy layout: class folders live directly under ``root`` (no split
            # subfolder) and frames are returned in THWC order for backward
            # compatibility; downloading would produce the new layout, so forbid it.
            self.split_folder = root
            self.split = "unknown"
            output_format = "THWC"
            if download:
                raise ValueError("Cannot download the videos using legacy_structure.")
        else:
            self.split_folder = path.join(root, split)
            self.split = verify_str_arg(split, arg="split", valid_values=["train", "val", "test"])

        if download:
            # Must happen before sample discovery below, since it creates the
            # class-folder structure that ``find_classes``/``make_dataset`` scan.
            self.download_and_process_videos()

        super().__init__(self.root)

        self.classes, class_to_idx = find_classes(self.split_folder)
        self.samples = make_dataset(self.split_folder, class_to_idx, extensions, is_valid_file=None)
        video_list = [x[0] for x in self.samples]
        self.video_clips = VideoClips(
            video_list,
            frames_per_clip,
            step_between_clips,
            frame_rate,
            _precomputed_metadata,
            num_workers=num_workers,
            _video_width=_video_width,
            _video_height=_video_height,
            _video_min_dimension=_video_min_dimension,
            _audio_samples=_audio_samples,
            _audio_channels=_audio_channels,
            output_format=output_format,
        )
        self.transform = transform

    def download_and_process_videos(self) -> None:
        """Downloads all the videos to the _root_ folder in the expected format."""
        self._download_videos()
        self._make_ds_structure()

    def _download_videos(self) -> None:
        """Download tarballs containing the videos to the "tars" folder and extract them into the
        _split_ folder where split is one of the official dataset splits.

        If the split folder already exists, the download is skipped entirely to avoid
        re-fetching the (very large) dataset.
        """
        if path.exists(self.split_folder):
            return

        tar_path = path.join(self.root, "tars")
        file_list_path = path.join(self.root, "files")

        split_url = self._TAR_URLS[self.num_classes].format(split=self.split)
        split_url_filepath = path.join(file_list_path, path.basename(split_url))
        if not check_integrity(split_url_filepath):
            download_url(split_url, file_list_path)
        with open(split_url_filepath) as file:
            # Percent-encode each URL while keeping path separators and the scheme intact.
            list_video_urls = [urllib.parse.quote(line, safe="/,:") for line in file.read().splitlines()]

        if self.num_download_workers == 1:
            for line in list_video_urls:
                download_and_extract_archive(line, tar_path, self.split_folder)
        else:
            part = partial(_dl_wrap, tar_path, self.split_folder)
            # Context manager guarantees the worker processes are torn down even if
            # a download raises (the previous bare Pool leaked its workers).
            with Pool(self.num_download_workers) as poolproc:
                poolproc.map(part, list_video_urls)

    def _make_ds_structure(self) -> None:
        """Move videos from

        split_folder/
            ├── clip1.avi
            ├── clip2.avi

        to the correct format as described below:

        split_folder/
            ├── class1
            │   ├── clip1.avi
        """
        annotation_path = path.join(self.root, "annotations")
        if not check_integrity(path.join(annotation_path, f"{self.split}.csv")):
            download_url(self._ANNOTATION_URLS[self.num_classes].format(split=self.split), annotation_path)
        annotations = path.join(annotation_path, f"{self.split}.csv")

        # Downloaded clips are named "<youtube_id>_<start>_<end>.mp4" with
        # zero-padded 6-digit timestamps.
        file_fmtstr = "{ytid}_{start:06}_{end:06}.mp4"
        with open(annotations) as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                f = file_fmtstr.format(
                    ytid=row["youtube_id"],
                    start=int(row["time_start"]),
                    end=int(row["time_end"]),
                )
                # Sanitize the human-readable label into a filesystem-friendly folder name.
                label = row["label"].replace(" ", "_").replace("'", "").replace("(", "").replace(")", "")
                os.makedirs(path.join(self.split_folder, label), exist_ok=True)
                downloaded_file = path.join(self.split_folder, f)
                if path.isfile(downloaded_file):
                    os.replace(
                        downloaded_file,
                        path.join(self.split_folder, label, f),
                    )

    @property
    def metadata(self) -> Dict[str, Any]:
        # Expose the underlying VideoClips metadata (usable as _precomputed_metadata
        # to skip re-indexing on a subsequent construction).
        return self.video_clips.metadata

    def __len__(self) -> int:
        # Dataset length is the number of fixed-size clips, not the number of videos.
        return self.video_clips.num_clips()

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor, int]:
        """Return the ``idx``-th clip as ``(video, audio, label)``.

        The label is looked up from the source video of the clip; the transform
        (if any) is applied to the video tensor only.
        """
        video, audio, info, video_idx = self.video_clips.get_clip(idx)
        label = self.samples[video_idx][1]

        if self.transform is not None:
            video = self.transform(video)

        return video, audio, label
|