# tja-generator / preprocess.py
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
from librosa.filters import mel
from scipy import signal
from scipy.fft import fft
class Audio:
"""
audio class which holds music data and timestamp for notes.
Args:
filename: file name.
stereo: True or False; wether you have Don/Ka streo file or not. normaly True.
Variables:
Example:
>>>from music_processor import *
>>>song = Audio(filename)
>>># to get audio data
>>>song.data
>>># to import .tja files:
>>>song.import_tja(filename)
>>># to get data converted
>>>song.data = (song.data[:,0]+song.data[:,1])/2
>>>fft_and_melscale(song, include_zero_cross=False)
"""
    def __init__(self, data, samplerate, stereo=True):
        self.data = data
        self.samplerate = samplerate
        if not stereo:
            # average the two channels down to mono
            self.data = (self.data[:, 0] + self.data[:, 1]) / 2
        self.timestamp = []
    def plotaudio(self, start_t, stop_t):
        """Plot the first channel between sample indices start_t and stop_t."""
        plt.plot(
            np.linspace(start_t, stop_t, stop_t - start_t), self.data[start_t:stop_t, 0]
        )
        plt.show()
    def save(self, filename, start_t=0, stop_t=None):
        """Write the samples in [start_t, stop_t) to filename."""
        if stop_t is None:
            stop_t = self.data.shape[0]
        sf.write(filename, self.data[start_t:stop_t], self.samplerate)
    def synthesize(self, diff=True, don="./asset/don.wav", ka="./asset/ka.wav"):
        """Mix don/ka hit sounds into the audio at each note timestamp."""
        donsound = sf.read(don)[0]
        donsound = (donsound[:, 0] + donsound[:, 1]) / 2  # downmix to mono
        kasound = sf.read(ka)[0]
        kasound = (kasound[:, 0] + kasound[:, 1]) / 2
        donlen = len(donsound)
        kalen = len(kasound)

        if diff is True:
            # timestamp entries are (seconds, note_type) tuples
            for stamp in self.timestamp:
                timing = int(stamp[0] * self.samplerate)
                try:
                    if stamp[1] in (1, 3, 5, 6, 7):  # don-type notes
                        self.data[timing : timing + donlen] += donsound
                    elif stamp[1] in (2, 4):  # ka-type notes
                        self.data[timing : timing + kalen] += kasound
                except ValueError:
                    # hit sound would run past the end of the track; skip it
                    pass

        elif diff == "don":
            for stamp in self.timestamp:
                # timestamps may be (seconds, note_type) tuples or bare times
                time = stamp[0] if isinstance(stamp, tuple) else stamp
                start = int(time * self.samplerate)
                if start + donlen < self.data.shape[0]:
                    self.data[start : start + donlen] += donsound

        elif diff == "ka":
            for stamp in self.timestamp:
                time = stamp[0] if isinstance(stamp, tuple) else stamp
                start = int(time * self.samplerate)
                if start + kalen < self.data.shape[0]:
                    self.data[start : start + kalen] += kasound
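# Usage sketch for Audio.synthesize (the timestamp values below are
# hypothetical): timestamps are (seconds, note_type) pairs, where note types
# 1/3/5/6/7 trigger the don sound and 2/4 the ka sound:
#
#     song.timestamp = [(1.0, 1), (1.5, 2), (2.0, 1)]
#     song.synthesize(diff=True)
#     song.save("./out/synthesized.wav")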
def make_frame(data, nhop, nfft):
    """
    Helper function for fft_and_melscale.
    To use finely time-sliced segments as training data, returns an array of
    nfft-sized windows, each shifted by nhop (512) samples from the last.
    """
    length = data.shape[0]
    framedata = np.concatenate((data, np.zeros(nfft)))  # zero padding so every frame is full length
    return np.array(
        [framedata[i * nhop : i * nhop + nfft] for i in range(length // nhop)]
    )
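# Shape sketch (hypothetical input): for mono data with 100_000 samples,
# nhop=512 and nfft=1024, make_frame returns an array of shape
# (100_000 // 512, 1024) == (195, 1024).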
# @jit
def fft_and_melscale(
    song,
    nhop=512,
    nffts=(1024, 2048, 4096),
    mel_nband=80,
    mel_freqlo=27.5,
    mel_freqhi=16000.0,
    include_zero_cross=False,
):
    """
    FFT and mel-scale method.
    fft: for each nfft in nffts (1024, 2048, 4096), slices the data into
    windows of that length and runs a fast Fourier transform on each one.
    melscale: reduces the frequency dimension with a mel filter bank and
    takes the log10 of the power.
    """
feat_channels = []
for nfft in nffts:
feats = []
window = signal.windows.blackmanharris(nfft)
filt = mel(
sr=song.samplerate,
n_fft=nfft,
n_mels=mel_nband,
fmin=mel_freqlo,
fmax=mel_freqhi,
)
        # slice the song into overlapping frames
        frame = make_frame(song.data, nhop, nfft)

        # FFT each windowed frame and keep the one-sided spectrum
        processedframe = fft(window * frame)[:, : nfft // 2 + 1]
        # mel-scale the power spectrum, then take the log magnitude
        processedframe = np.dot(filt, np.transpose(np.abs(processedframe) ** 2))
        processedframe = 20 * np.log10(processedframe + 0.1)
        feat_channels.append(processedframe)

    if include_zero_cross:
        # indices where the waveform changes sign
        song.zero_crossing = np.where(np.diff(np.sign(song.data)))[0]
        print(song.zero_crossing)
return np.array(feat_channels)
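

# Minimal end-to-end sketch (the WAV path below is hypothetical; any stereo
# file readable by soundfile works):
if __name__ == "__main__":
    data, samplerate = sf.read("./asset/sample.wav")
    song = Audio(data, samplerate, stereo=False)  # downmix to mono
    feats = fft_and_melscale(song)
    # one channel per nfft: (len(nffts), mel_nband, n_frames)
    print(feats.shape)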