Commit 03a8accc authored by Bernhard Geier's avatar Bernhard Geier
Browse files

Add the playlist from BR's website to the episode's ID3 comment tag

make both scripts more similar
parent c09f06c1
...@@ -15,7 +15,12 @@ import lxml ...@@ -15,7 +15,12 @@ import lxml
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
#import pprint #import pprint
baseUrl="https://www.br.de/radio/bayern2/sendungen/nachtmix/index.html";
# Landing page of the show; scraped for links to broadcast episodes.
baseUrl = "https://www.br.de/radio/bayern2/sendungen/nachtmix/index.html"
# Calendar page listing the playlists of all past broadcasts.
playlistsBaseUrl = "https://www.br.de/radio/bayern2/sendungen/nachtmix/playlisten/index.html"
# Episodes shorter than this (45 minutes, in milliseconds) are skipped as non-episodes.
minimalEpisodeDuration_ms = 45 * 60 * 1000
# Broadcast name; used to filter episodes and to build playlist-link regexes.
showTitle = "Nachtmix"
def download(url: str, attempts=4): def download(url: str, attempts=4):
tmpfile = NamedTemporaryFile(delete=False) tmpfile = NamedTemporaryFile(delete=False)
...@@ -31,20 +36,6 @@ def download(url: str, attempts=4): ...@@ -31,20 +36,6 @@ def download(url: str, attempts=4):
pass pass
return None return None
# --- command-line handling ---
# Exactly one argument is required: the download target directory.
# (NOTE: indentation of this script block was lost in the diff rendering.)
if len(sys.argv) != 2:
print("Usage:", file=sys.stderr)
print("%s <DownloadDir>\n" % sys.argv[0], file=sys.stderr)
print("Example:", file=sys.stderr)
print("%s 'Downloads/Nachtmix Recordings'\n" % sys.argv[0], file=sys.stderr)
sys.exit(1)
# Target directory for downloaded episodes; must already exist.
DESTDIR = sys.argv[1]
if not os.path.isdir(DESTDIR):
print("Directory %s does not exist!" % DESTDIR, file=sys.stderr)
sys.exit(1)
def time2seconds(timestr: str): def time2seconds(timestr: str):
# return duration of HH:MM:SS in seconds # return duration of HH:MM:SS in seconds
parts = re.split(":", timestr) parts = re.split(":", timestr)
...@@ -57,6 +48,48 @@ def safe_text_get(l: list, idx: int, default=None): ...@@ -57,6 +48,48 @@ def safe_text_get(l: list, idx: int, default=None):
except IndexError: except IndexError:
return default return default
def get_playlist_as_text(dt: datetime):
    """Return the playlist broadcast on date *dt* as one text line, or None.

    Scrapes three pages on br.de: the playlist calendar, the page for the
    requested day, and finally the playlist page itself.  Individual tracks
    are joined with " | "; the fields of one track (artist, title, ...)
    with " - ".  Any failure (network error, link not found, site layout
    change) yields None, so callers can treat the playlist as optional,
    best-effort data.
    """
    try:
        # Calendar page with entries for all available playlists.
        html = requests.get(playlistsBaseUrl, timeout=5).text
        soup = BeautifulSoup(html, 'lxml')

        # Select the link for the requested day; its href embeds the date
        # as ..._date-YYYY-MM-DD_...  Raw strings keep the escapes literal
        # (the old non-raw '\-' sequences were invalid escape sequences).
        dayHrefRe = re.compile(
            r'.+_date\-%s\-%s\-%s_.+\.html$'
            % (dt.strftime("%Y"), dt.strftime("%m"), dt.strftime("%d")))
        dayLink = soup.find('a', class_=re.compile(r'^playlisten.+'),
                            href=dayHrefRe)['href']
        dayUrl = urllib.parse.urljoin(playlistsBaseUrl, dayLink)

        # Follow the day link to the playlist page.  Depending on the show,
        # class/href look like "playlist-<id>" or "playlist-<show>-<id>";
        # re.escape guards against regex metacharacters in the show title.
        show = re.escape(showTitle.lower())
        html = requests.get(dayUrl, timeout=5).text
        soup = BeautifulSoup(html, 'lxml')
        plsLink = soup.find(
            'a',
            class_=re.compile(r'^playlist(\-%s)?\-\d+$' % show),
            # '\.html' — the original pattern had an unescaped dot here.
            href=re.compile(r'.+playlist(\-%s)?\-\d+\.html$' % show))['href']
        plsUrl = urllib.parse.urljoin(playlistsBaseUrl, plsLink)

        # Read the playlist page and flatten all entries into one string.
        html = requests.get(plsUrl, timeout=5).text
        soup = BeautifulSoup(html, 'lxml')
        playlistEntries = [" - ".join(entry.find_all(text=True))
                           for entry in soup.select('div.detail_content > p.copytext')]
        return " | ".join(playlistEntries)
    except Exception:
        # Deliberate best-effort: a missing playlist must not abort the
        # download run.  Narrowed from a bare 'except:', which also
        # swallowed SystemExit and KeyboardInterrupt.
        return None
# --- command-line handling ---
# Exactly one argument is required: the download target directory.
# (NOTE: indentation of this script block was lost in the diff rendering.)
if len(sys.argv) != 2:
print("Usage:", file=sys.stderr)
print("%s <DownloadDir>\n" % sys.argv[0], file=sys.stderr)
print("Example:", file=sys.stderr)
print("%s 'Downloads/%s Recordings'\n" % (sys.argv[0], showTitle), file=sys.stderr)
sys.exit(1)
# Target directory for downloaded episodes; must already exist.
DESTDIR = sys.argv[1]
if not os.path.isdir(DESTDIR):
print("Directory %s does not exist!" % DESTDIR, file=sys.stderr)
sys.exit(1)
html = requests.get(baseUrl, timeout=5).text html = requests.get(baseUrl, timeout=5).text
soup = BeautifulSoup(html, 'lxml') soup = BeautifulSoup(html, 'lxml')
...@@ -151,14 +184,15 @@ for bc in broadcastJson['channelBroadcasts']: ...@@ -151,14 +184,15 @@ for bc in broadcastJson['channelBroadcasts']:
'filename': None, 'filename': None,
'filepath': None, 'filepath': None,
'duration_ms': time2seconds(XMLmeta['duration']) * 1000, 'duration_ms': time2seconds(XMLmeta['duration']) * 1000,
'playlist_text': None,
} }
## Filter out some episodes ## Filter out some episodes
# I know that a real Nachtmix episode is longer than 45 minutes. Skip this episode if it is shorter # Skip this episode if it is shorter than defined minimal duration
if meta['duration_ms'] < 45 * 60 * 1000: if meta['duration_ms'] < minimalEpisodeDuration_ms:
continue continue
# Skip all non "Nachtmix" broadcasts # Skip this episode if "Broadcast" is not matching the show's title
if XMLmeta['broadcast'].lower() != 'nachtmix': if XMLmeta['broadcast'].lower() != showTitle.lower():
continue continue
...@@ -177,7 +211,7 @@ for bc in broadcastJson['channelBroadcasts']: ...@@ -177,7 +211,7 @@ for bc in broadcastJson['channelBroadcasts']:
## Populate values in "meta" dict ## Populate values in "meta" dict
# agf_c9 looks like "Nachtmix_Nachtmix_27.08.2020_23:05" # agf_c9 looks like "Zündfunk_Zündfunk_27.08.2020_19:05" or "Zündfunk_Zündfunk Generator_30.08.2020_22:05" or "Nachtmix_Nachtmix_27.08.2020_23:05"
# so it can be used to extract the episode's exact broadcast time # so it can be used to extract the episode's exact broadcast time
try: try:
parts = XMLmeta['agf_c9'].split('_') parts = XMLmeta['agf_c9'].split('_')
...@@ -202,6 +236,12 @@ for bc in broadcastJson['channelBroadcasts']: ...@@ -202,6 +236,12 @@ for bc in broadcastJson['channelBroadcasts']:
print ("ERROR: Could not download %s" % url, file=sys.stderr) print ("ERROR: Could not download %s" % url, file=sys.stderr)
sys.exit(1) sys.exit(1)
# get playlist
playlist_text = get_playlist_as_text(meta['broadcastDate_dt'])
if playlist_text:
meta['playlist_text'] = "PLAYLIST: " + playlist_text
# set ID3 tag # set ID3 tag
try: try:
tag = ID3(tmpFile) tag = ID3(tmpFile)
...@@ -215,7 +255,7 @@ for bc in broadcastJson['channelBroadcasts']: ...@@ -215,7 +255,7 @@ for bc in broadcastJson['channelBroadcasts']:
tag.add(TRCK(text=["1/1"])) tag.add(TRCK(text=["1/1"]))
#tag.add(TIT2(text=[meta['broadcastDate_dt'].strftime("%Y-%m-%d") + ": "+XMLmeta['title']])) #tag.add(TIT2(text=[meta['broadcastDate_dt'].strftime("%Y-%m-%d") + ": "+XMLmeta['title']]))
tag.add(TIT2(text=[XMLmeta['title']])) tag.add(TIT2(text=[XMLmeta['title']]))
tag.add(COMM(lang="deu", desc="desc", text=[XMLmeta['desc']])) tag.add(COMM(lang="deu", desc="desc", text=[ " /// ".join(filter(None, [XMLmeta['desc'], meta['playlist_text']]))]))
tag.add(TYER(text=[meta['broadcastDate_dt'].strftime("%Y")])) tag.add(TYER(text=[meta['broadcastDate_dt'].strftime("%Y")]))
tag.add(TDAT(text=[meta['broadcastDate_dt'].strftime("%d%m")])) tag.add(TDAT(text=[meta['broadcastDate_dt'].strftime("%d%m")]))
tag.add(TIME(text=[meta['broadcastDate_dt'].strftime("%H%M")])) tag.add(TIME(text=[meta['broadcastDate_dt'].strftime("%H%M")]))
......
...@@ -15,8 +15,12 @@ import lxml ...@@ -15,8 +15,12 @@ import lxml
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
#import pprint #import pprint
baseUrl="https://www.br.de/radio/bayern2/sendungen/zuendfunk/programm-nachhoeren/index.html" baseUrl="https://www.br.de/radio/bayern2/sendungen/zuendfunk/programm-nachhoeren/index.html"
playlistsBaseUrl="https://www.br.de/radio/bayern2/sendungen/zuendfunk/pop-platten/playlisten/index.html" playlistsBaseUrl="https://www.br.de/radio/bayern2/sendungen/zuendfunk/pop-platten/playlisten/index.html"
# Episodes shorter than this (45 minutes, in milliseconds) are skipped as non-episodes.
minimalEpisodeDuration_ms = 45 * 60 * 1000
# Broadcast name; used to filter episodes and to build playlist-link regexes.
showTitle = "Zündfunk"
def download(url: str, attempts=4): def download(url: str, attempts=4):
tmpfile = NamedTemporaryFile(delete=False) tmpfile = NamedTemporaryFile(delete=False)
...@@ -32,20 +36,6 @@ def download(url: str, attempts=4): ...@@ -32,20 +36,6 @@ def download(url: str, attempts=4):
pass pass
return None return None
# --- command-line handling ---
# Exactly one argument is required: the download target directory.
# (NOTE: indentation of this script block was lost in the diff rendering.)
if len(sys.argv) != 2:
print("Usage:", file=sys.stderr)
print("%s <DownloadDir>\n" % sys.argv[0], file=sys.stderr)
print("Example:", file=sys.stderr)
print("%s 'Downloads/Zündfunk Recordings'\n" % sys.argv[0], file=sys.stderr)
sys.exit(1)
# Target directory for downloaded episodes; must already exist.
DESTDIR = sys.argv[1]
if not os.path.isdir(DESTDIR):
print("Directory %s does not exist!" % DESTDIR, file=sys.stderr)
sys.exit(1)
def time2seconds(timestr: str): def time2seconds(timestr: str):
# return duration of HH:MM:SS in seconds # return duration of HH:MM:SS in seconds
parts = re.split(":", timestr) parts = re.split(":", timestr)
...@@ -71,7 +61,7 @@ def get_playlist_as_text(dt: datetime): ...@@ -71,7 +61,7 @@ def get_playlist_as_text(dt: datetime):
# follow link to playlist # follow link to playlist
html = requests.get(dayUrl, timeout=5).text html = requests.get(dayUrl, timeout=5).text
soup = BeautifulSoup(html, 'lxml') soup = BeautifulSoup(html, 'lxml')
plsLink = soup.find('a', class_=re.compile("^playlist\-\d+$"), href=re.compile('.+playlist\-\d+.html$'))['href'] plsLink = soup.find('a', class_=re.compile("^playlist(\-"+showTitle.lower()+")?\-\d+$"), href=re.compile('.+playlist(\-'+showTitle.lower()+')?\-\d+.html$'))['href']
plsUrl = urllib.parse.urljoin(playlistsBaseUrl, plsLink) plsUrl = urllib.parse.urljoin(playlistsBaseUrl, plsLink)
# read playlist # read playlist
...@@ -87,6 +77,20 @@ def get_playlist_as_text(dt: datetime): ...@@ -87,6 +77,20 @@ def get_playlist_as_text(dt: datetime):
return None return None
# --- command-line handling ---
# Exactly one argument is required: the download target directory.
# (NOTE: indentation of this script block was lost in the diff rendering.)
if len(sys.argv) != 2:
print("Usage:", file=sys.stderr)
print("%s <DownloadDir>\n" % sys.argv[0], file=sys.stderr)
print("Example:", file=sys.stderr)
print("%s 'Downloads/%s Recordings'\n" % (sys.argv[0], showTitle), file=sys.stderr)
sys.exit(1)
# Target directory for downloaded episodes; must already exist.
DESTDIR = sys.argv[1]
if not os.path.isdir(DESTDIR):
print("Directory %s does not exist!" % DESTDIR, file=sys.stderr)
sys.exit(1)
html = requests.get(baseUrl, timeout=5).text html = requests.get(baseUrl, timeout=5).text
soup = BeautifulSoup(html, 'lxml') soup = BeautifulSoup(html, 'lxml')
...@@ -156,7 +160,7 @@ for bc in broadcastJson['channelBroadcasts']: ...@@ -156,7 +160,7 @@ for bc in broadcastJson['channelBroadcasts']:
# extract metadata from XML with longest audio # extract metadata from XML with longest audio
XMLmeta = { XMLmeta = {
'topline': safe_text_get(xmls[0].xpath("./audio/topline"),0), 'topline': safe_text_get(xmls[0].xpath("./audio/topline"),0),
'title': safe_text_get(xmls[0].xpath("./audio/title"),0), 'title': re.sub("^Jetzt nachhören: ","", safe_text_get(xmls[0].xpath("./audio/title"),0)),
'shareTitle': safe_text_get(xmls[0].xpath("./audio/shareTitle"),0), 'shareTitle': safe_text_get(xmls[0].xpath("./audio/shareTitle"),0),
'duration': safe_text_get(xmls[0].xpath("./audio/duration"),0), 'duration': safe_text_get(xmls[0].xpath("./audio/duration"),0),
'channel': safe_text_get(xmls[0].xpath("./audio/channel"),0,"BAYERN 2"), 'channel': safe_text_get(xmls[0].xpath("./audio/channel"),0,"BAYERN 2"),
...@@ -184,11 +188,11 @@ for bc in broadcastJson['channelBroadcasts']: ...@@ -184,11 +188,11 @@ for bc in broadcastJson['channelBroadcasts']:
} }
## Filter out some episodes ## Filter out some episodes
# I know that a real Zündfunk episode is longer than 45 minutes. Skip this episode if it is shorter # Skip this episode if it is shorter than defined minimal duration
if meta['duration_ms'] < 45 * 60 * 1000: if meta['duration_ms'] < minimalEpisodeDuration_ms:
continue continue
# Skip all non "Zündfunk" broadcasts like "Zündfunk Generator" (which has an official podcast feed) # Skip this episode if "Broadcast" is not matching the show's title
if XMLmeta['broadcast'].lower() != 'zündfunk': if XMLmeta['broadcast'].lower() != showTitle.lower():
continue continue
...@@ -207,7 +211,7 @@ for bc in broadcastJson['channelBroadcasts']: ...@@ -207,7 +211,7 @@ for bc in broadcastJson['channelBroadcasts']:
## Populate values in "meta" dict ## Populate values in "meta" dict
# agf_c9 looks like "Zündfunk_Zündfunk_27.08.2020_19:05" or "Zündfunk_Zündfunk Generator_30.08.2020_22:05" # agf_c9 looks like "Zündfunk_Zündfunk_27.08.2020_19:05" or "Zündfunk_Zündfunk Generator_30.08.2020_22:05" or "Nachtmix_Nachtmix_27.08.2020_23:05"
# so it can be used to extract the episode's exact broadcast time # so it can be used to extract the episode's exact broadcast time
try: try:
parts = XMLmeta['agf_c9'].split('_') parts = XMLmeta['agf_c9'].split('_')
...@@ -279,5 +283,5 @@ for bc in broadcastJson['channelBroadcasts']: ...@@ -279,5 +283,5 @@ for bc in broadcastJson['channelBroadcasts']:
# done # done
shutil.move(tmpFile, meta['filepath']) shutil.move(tmpFile, meta['filepath'])
os.chmod(meta['filepath'], 0o644) os.chmod(meta['filepath'], 0o644)
print("done.", flush=True) print("done.", flush=True)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment