import matplotlib.pyplot as plt import numpy as np import soundfile as sf from librosa.filters import mel from scipy import signal from scipy.fftpack import fft class Audio: """ audio class which holds music data and timestamp for notes. Args: filename: file name. stereo: True or False; wether you have Don/Ka streo file or not. normaly True. Variables: Example: >>>from music_processor import * >>>song = Audio(filename) >>># to get audio data >>>song.data >>># to import .tja files: >>>song.import_tja(filename) >>># to get data converted >>>song.data = (song.data[:,0]+song.data[:,1])/2 >>>fft_and_melscale(song, include_zero_cross=False) """ def __init__(self, data, samplerate, stereo=True): self.data = data self.samplerate = samplerate if stereo is False: self.data = (self.data[:, 0] + self.data[:, 1]) / 2 self.timestamp = [] def plotaudio(self, start_t, stop_t): plt.plot( np.linspace(start_t, stop_t, stop_t - start_t), self.data[start_t:stop_t, 0] ) plt.show() def save(self, filename, start_t=0, stop_t=None): if stop_t is None: stop_t = self.data.shape[0] sf.write(filename, self.data[start_t:stop_t], self.samplerate) def synthesize(self, diff=True, don="./asset/don.wav", ka="./asset/ka.wav"): donsound = sf.read(don)[0] donsound = (donsound[:, 0] + donsound[:, 1]) / 2 kasound = sf.read(ka)[0] kasound = (kasound[:, 0] + kasound[:, 1]) / 2 donlen = len(donsound) kalen = len(kasound) if diff is True: for stamp in self.timestamp: timing = int(stamp[0] * self.samplerate) try: if stamp[1] in (1, 3, 5, 6, 7): self.data[timing : timing + donlen] += donsound elif stamp[1] in (2, 4): self.data[timing : timing + kalen] += kasound except ValueError: pass elif diff == "don": if isinstance(self.timestamp[0], tuple): for stamp in self.timestamp: if stamp * self.samplerate + donlen < self.data.shape[0]: self.data[ int(stamp[0] * self.samplerate) : int( stamp[0] * self.samplerate ) + donlen ] += donsound else: for stamp in self.timestamp: if stamp * self.samplerate + donlen < self.data.shape[0]: self.data[ int(stamp * self.samplerate) : int(stamp * self.samplerate) + donlen ] += donsound elif diff == "ka": if isinstance(self.timestamp[0], tuple): for stamp in self.timestamp: if stamp * self.samplerate + kalen < self.data.shape[0]: self.data[ int(stamp[0] * self.samplerate) : int( stamp[0] * self.samplerate ) + kalen ] += kasound else: for stamp in self.timestamp: if stamp * self.samplerate + kalen < self.data.shape[0]: self.data[ int(stamp * self.samplerate) : int(stamp * self.samplerate) + kalen ] += kasound def make_frame(data, nhop, nfft): """ helping function for fftandmelscale. 細かい時間に切り分けたものを学習データとするため,nhop(512)ずつずらしながらnfftサイズのデータを配列として返す """ length = data.shape[0] framedata = np.concatenate((data, np.zeros(nfft))) # zero padding return np.array( [framedata[i * nhop : i * nhop + nfft] for i in range(length // nhop)] ) # @jit def fft_and_melscale( song, nhop=512, nffts=[1024, 2048, 4096], mel_nband=80, mel_freqlo=27.5, mel_freqhi=16000.0, include_zero_cross=False, ): """ fft and melscale method. fft: nfft = [1024, 2048, 4096]; サンプルの切り取る長さを変えながらデータからnp.arrayを抽出して高速フーリエ変換を行う. melscale: 周波数の次元を削減するとともに,log10の値を取っている. """ feat_channels = [] for nfft in nffts: feats = [] window = signal.windows.blackmanharris(nfft) filt = mel( sr=song.samplerate, n_fft=nfft, n_mels=mel_nband, fmin=mel_freqlo, fmax=mel_freqhi, ) # get normal frame frame = make_frame(song.data, nhop, nfft) # print(frame.shape) # melscaling processedframe = fft(window * frame)[:, : nfft // 2 + 1] processedframe = np.dot(filt, np.transpose(np.abs(processedframe) ** 2)) processedframe = 20 * np.log10(processedframe + 0.1) # print(processedframe.shape) feat_channels.append(processedframe) if include_zero_cross: song.zero_crossing = np.where(np.diff(np.sign(song.data)))[0] print(song.zero_crossing) return np.array(feat_channels)