Spaces:

terapyon
/

podcast-search

Running

App Files Files Community

podcast-search / src /episode.py

terapyon

srtの分割を1分にし、configなどを整え、READMEを書いた

d788666 about 1 year ago

raw

history blame contribute delete

3.14 kB

	from dataclasses import dataclass
	from datetime import time as dt_time
	from datetime import timedelta
	import pandas as pd
	from config import STORE_DIR, DATA_DIR, divider_time, RE_PODCAST_SRT_FILE


	@dataclass
	class SplitedText:
	    """One chunk of transcript text together with its position in the episode."""

	    # Sequence number of the chunk within its episode (make_episode counts from 1).
	    part: int
	    # Chunk start offset; make_episode stores it as whole seconds.
	    start: int
	    # Chunk end offset; make_episode stores it as whole seconds.
	    end: int
	    # Caption lines belonging to the chunk, joined with newlines.
	    text: str


	@dataclass
	class Episode:
	    """A podcast episode and the transcript chunks extracted for it."""

	    # Numeric episode identifier.
	    id_: int
	    # Episode title; None when the caller has no title available.
	    title: str | None
	    # Transcript chunks in the order they were produced.
	    texts: list[SplitedText]


	def str_to_timedelta(s: str) -> timedelta:
	    """Convert a clock-style timestamp string into a whole-second ``timedelta``.

	    The fractional part of the timestamp, if any, is discarded.

	    ``time.fromisoformat`` is tried first so every input it accepts keeps its
	    previous result.  When it rejects the string, a fallback parser handles
	    ``H+:MM:SS`` with an optional ``,fff`` or ``.fff`` fraction.  This covers
	    the SRT comma separator on interpreters whose ``fromisoformat`` does not
	    accept it, and hour values of 24 or more, which ``datetime.time`` cannot
	    represent.

	    Args:
	        s: Timestamp text such as ``"00:01:02,345"``.

	    Returns:
	        The offset as a ``timedelta`` with second resolution.

	    Raises:
	        ValueError: If the string is not a recognisable timestamp.
	    """
	    try:
	        t = dt_time.fromisoformat(s)
	    except ValueError:
	        # Normalise the SRT comma to a dot, then drop the fraction entirely.
	        clock = s.strip().replace(",", ".").partition(".")[0]
	        fields = clock.split(":")
	        if len(fields) != 3 or not all(field.isdigit() for field in fields):
	            raise ValueError(f"Invalid timestamp: {s!r}") from None
	        hours, minutes, seconds = (int(field) for field in fields)
	        if minutes > 59 or seconds > 59:
	            raise ValueError(f"Invalid timestamp: {s!r}") from None
	        return timedelta(hours=hours, minutes=minutes, seconds=seconds)
	    return timedelta(hours=t.hour, minutes=t.minute, seconds=t.second)


	def _append_part(episode, part, start, end, lines):
	    """Append one finished chunk to ``episode``, storing bounds as whole seconds."""
	    episode.texts.append(
	        SplitedText(
	            part=part,
	            start=int(start.total_seconds()),
	            end=int(end.total_seconds()),
	            text="\n".join(lines),
	        )
	    )


	def make_episode(id_: int, title: str | None, srt_filename: str) -> Episode:
	    """Read an SRT file and group its captions into time-bounded chunks.

	    Captions are accumulated cue by cue.  Once the span from the chunk's first
	    cue start to the latest cue end exceeds ``divider_time``, the chunk is
	    closed and a new one begins with the next cue.  Each chunk's ``start`` and
	    ``end`` therefore bound exactly the captions stored in its ``text``.
	    Captions left over at the end of the file are emitted as a final, shorter
	    chunk rather than discarded.

	    Lines consisting only of digits are treated as SRT cue indices and skipped,
	    so a caption line made up solely of digits is skipped as well.

	    Args:
	        id_: Episode identifier stored on the returned ``Episode``.
	        title: Episode title, or None when unknown.
	        srt_filename: Path of the SRT file (anything ``open()`` accepts).

	    Returns:
	        An ``Episode`` whose ``texts`` holds the chunks, numbered from 1.

	    Raises:
	        ValueError: If a timestamp line cannot be parsed.
	    """
	    episode = Episode(id_=id_, title=title, texts=[])
	    part = 1
	    start = None  # start time of the first cue in the chunk being built
	    end = None  # end time of the most recent cue in that chunk
	    lines = []  # caption lines collected for that chunk

	    # utf-8-sig decodes plain UTF-8 and also strips a leading BOM, which would
	    # otherwise keep the first index line from being recognised as digits.
	    with open(srt_filename, encoding="utf-8-sig") as f:
	        for raw_line in f:
	            line = raw_line.strip()
	            if not line or line.isdigit():
	                continue
	            if "-->" not in line:
	                lines.append(line)
	                continue

	            # A new timestamp line means the previous cue is complete, so this
	            # is the point where a long-enough chunk can be closed with text
	            # and time bounds that agree.  Compare against None explicitly:
	            # timedelta(0) is falsy, and 00:00:00 is a valid start time.
	            if start is not None and lines and abs(end - start) > divider_time:
	                _append_part(episode, part, start, end, lines)
	                part += 1
	                start = None
	                lines = []

	            first_str, second_str = line.split("-->")
	            first = str_to_timedelta(first_str.strip())
	            end = str_to_timedelta(second_str.strip())
	            if start is None:
	                start = first

	    # Emit whatever remains so the tail of the transcript is not lost.
	    if start is not None and lines:
	        _append_part(episode, part, start, end, lines)

	    print(len(episode.texts))
	    return episode


	def make_df(episode: Episode) -> pd.DataFrame:
	    """Flatten an episode's chunks into a DataFrame with one row per chunk.

	    The columns are ``id``, ``part``, ``start``, ``end_`` and ``text``; the
	    episode id is repeated on every row.
	    """
	    columns = ["id", "part", "start", "end_", "text"]
	    rows = [
	        [episode.id_, chunk.part, chunk.start, chunk.end, chunk.text]
	        for chunk in episode.texts
	    ]
	    return pd.DataFrame(rows, columns=columns)


	def get_srt_files():
	    """List the SRT files in ``DATA_DIR`` whose names match the podcast pattern.

	    Returns a list of ``{"id": <int>, "srt": <file name>}`` dicts, where the id
	    is the first capture group of ``RE_PODCAST_SRT_FILE`` converted to int.
	    Files whose names do not match the pattern are ignored.
	    """
	    found = []
	    for srt_path in DATA_DIR.glob("*.srt"):
	        matched = RE_PODCAST_SRT_FILE.search(srt_path.name)
	        if matched is None:
	            continue
	        found.append({"id": int(matched.group(1)), "srt": srt_path.name})
	    return found


	def main():
	    """Convert every discovered SRT file into a per-episode parquet file.

	    Files are processed in ascending id order.  Each one is chunked with
	    ``make_episode``, flattened with ``make_df`` and written to
	    ``STORE_DIR / "podcast-<id>.parquet"``.  The file count and each id are
	    printed as progress output.
	    """
	    lst = get_srt_files()
	    lst.sort(key=lambda entry: entry["id"])
	    print(f"{len(lst)=}")
	    for entry in lst:
	        episode_id = entry["id"]
	        print(episode_id)
	        # The entries built by get_srt_files carry no "title" key, so .get()
	        # yields None here.
	        episode = make_episode(
	            episode_id,
	            entry.get("title"),
	            DATA_DIR / entry["srt"],
	        )
	        frame = make_df(episode)
	        frame.to_parquet(STORE_DIR / f"podcast-{episode_id}.parquet")


	# Run the conversion only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()