Spaces:

jamtur01
/

MMaDA

Runtime error

App Files Files Community

MMaDA / venv /lib /python3.11 /site-packages /torchvision /datasets /ucf101.py

jamtur01

Upload folder using huggingface_hub

9c6594c verified about 1 month ago

raw

history blame contribute delete

5.53 kB

	import os
	from pathlib import Path
	from typing import Any, Callable, Dict, List, Optional, Tuple, Union

	from torch import Tensor

	from .folder import find_classes, make_dataset
	from .video_utils import VideoClips
	from .vision import VisionDataset


	class UCF101(VisionDataset):
	    """
	    `UCF101 <https://www.crcv.ucf.edu/data/UCF101.php>`_ dataset.

	    UCF101 is an action recognition video dataset.
	    This dataset considers every video as a collection of video clips of fixed size, specified
	    by ``frames_per_clip``, where the step in frames between each clip is given by
	    ``step_between_clips``. The dataset itself can be downloaded from the dataset website;
	    annotations that ``annotation_path`` should be pointing to can be downloaded from `here
	    <https://www.crcv.ucf.edu/data/UCF101/UCF101TrainTestSplits-RecognitionTask.zip>`_.

	    To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
	    and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
	    elements will come from video 1, and the next three elements from video 2.
	    Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
	    frames in a video might be present.

	    Internally, it uses a VideoClips object to handle clip creation.

	    Args:
	        root (str or ``pathlib.Path``): Root directory of the UCF101 Dataset.
	        annotation_path (str): path to the folder containing the split files;
	            see docstring above for download instructions of these files
	        frames_per_clip (int): number of frames in a clip.
	        step_between_clips (int, optional): number of frames between each clip.
	        frame_rate (int, optional): passed through unchanged to ``VideoClips``.
	        fold (int, optional): which fold to use. Should be between 1 and 3.
	        train (bool, optional): if ``True``, creates a dataset from the train split,
	            otherwise from the ``test`` split.
	        transform (callable, optional): A function/transform that takes in a TxHxWxC video
	            and returns a transformed version.
	        num_workers (int, optional): passed through unchanged to ``VideoClips``.
	        output_format (str, optional): The format of the output video tensors (before transforms).
	            Can be either "THWC" (default) or "TCHW".

	    Returns:
	        tuple: A 3-tuple with the following entries:

	            - video (Tensor[T, H, W, C] or Tensor[T, C, H, W]): The `T` video frames
	            - audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
	              and `L` is the number of points
	            - label (int): class of the video clip

	    Raises:
	        ValueError: if ``fold`` is not between 1 and 3.
	    """

	    def __init__(
	        self,
	        root: Union[str, Path],
	        annotation_path: str,
	        frames_per_clip: int,
	        step_between_clips: int = 1,
	        frame_rate: Optional[int] = None,
	        fold: int = 1,
	        train: bool = True,
	        transform: Optional[Callable] = None,
	        _precomputed_metadata: Optional[Dict[str, Any]] = None,
	        num_workers: int = 1,
	        _video_width: int = 0,
	        _video_height: int = 0,
	        _video_min_dimension: int = 0,
	        _audio_samples: int = 0,
	        output_format: str = "THWC",
	    ) -> None:
	        """Index every ``.avi`` file under ``root`` and keep the clips of the requested fold/split."""
	        super().__init__(root)
	        if not 1 <= fold <= 3:
	            raise ValueError(f"fold should be between 1 and 3, got {fold}")

	        extensions = ("avi",)
	        self.fold = fold
	        self.train = train

	        # ``self.samples`` holds one entry per video found under ``root``;
	        # element 0 is the file path and element 1 is used as the label in __getitem__.
	        self.classes, class_to_idx = find_classes(self.root)
	        self.samples = make_dataset(self.root, class_to_idx, extensions, is_valid_file=None)
	        video_list = [sample[0] for sample in self.samples]
	        video_clips = VideoClips(
	            video_list,
	            frames_per_clip,
	            step_between_clips,
	            frame_rate,
	            _precomputed_metadata,
	            num_workers=num_workers,
	            _video_width=_video_width,
	            _video_height=_video_height,
	            _video_min_dimension=_video_min_dimension,
	            _audio_samples=_audio_samples,
	            output_format=output_format,
	        )
	        # we bookkeep the full version of video clips because we want to be able
	        # to return the metadata of full version rather than the subset version of
	        # video clips
	        self.full_video_clips = video_clips
	        # positions (into ``video_list`` / ``self.samples``) of the videos in the chosen split
	        self.indices = self._select_fold(video_list, annotation_path, fold, train)
	        self.video_clips = video_clips.subset(self.indices)
	        self.transform = transform

	    @property
	    def metadata(self) -> Dict[str, Any]:
	        """Metadata of the full (un-subsetted) ``VideoClips`` object."""
	        return self.full_video_clips.metadata

	    def _select_fold(self, video_list: List[str], annotation_path: str, fold: int, train: bool) -> List[int]:
	        """Return the positions in ``video_list`` of the videos named by the split file.

	        The split file is ``{train|test}list{fold:02d}.txt`` inside ``annotation_path``.
	        The first space-separated token of every line is read as a ``/``-separated
	        path relative to ``self.root``; anything after it on the line is ignored.

	        Args:
	            video_list (List[str]): paths of all indexed videos.
	            annotation_path (str): folder containing the split files.
	            fold (int): fold number used to build the split file name.
	            train (bool): selects the ``train`` list when ``True``, else the ``test`` list.

	        Returns:
	            List[int]: indices ``i`` such that ``video_list[i]`` appears in the split file,
	            in increasing order.
	        """
	        split = "train" if train else "test"
	        split_file = os.path.join(annotation_path, f"{split}list{fold:02d}.txt")
	        with open(split_file) as fid:
	            relative_paths = [line.strip().split(" ")[0] for line in fid]
	        # Rebuild each entry with the OS separator so it compares equal to the
	        # paths stored in ``video_list``; a set keeps the membership test O(1).
	        selected_files = {os.path.join(self.root, *rel.split("/")) for rel in relative_paths}
	        return [i for i, video_path in enumerate(video_list) if video_path in selected_files]

	    def __len__(self) -> int:
	        """Number of clips in the selected fold/split."""
	        return self.video_clips.num_clips()

	    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor, int]:
	        """Return ``(video, audio, label)`` for clip ``idx``; ``transform`` is applied to the video only."""
	        video, audio, _info, video_idx = self.video_clips.get_clip(idx)
	        # ``video_idx`` indexes the subset of clips, so map it back through
	        # ``self.indices`` to find the matching entry of ``self.samples``.
	        label = self.samples[self.indices[video_idx]][1]

	        if self.transform is not None:
	            video = self.transform(video)

	        return video, audio, label