Commit 7b74efb2 authored by Bernhard Geier

adapt new API

parent f9687320
@@ -6,27 +6,29 @@ This is a Python 3 command line script to download shows from any channel from B
The shows are saved in MP3 format and get tagged with all available information, including chapter markers.
### Requirements
Python 3 with modules "pydub", "mutagen", "beautifulsoup4" and "requests".
(On Debian/Ubuntu: `sudo apt install python3 python3-mutagen python3-requests python3-bs4 pydub`)
Python 3 with modules "pydub", "mutagen" and "requests".
(On Debian/Ubuntu: `sudo apt install python3 python3-mutagen python3-requests pydub`)
### Usage
```./br-download.py <Channel> <Show> <TargetDirectory>```
* `TargetDirectory` is the directory you want the MP3 files to be saved in
* `Show` is the show's title as displayed in BR's "Live" (player https://www.br.de/radio/live/)
* `Channel` can be something like "bayern2", "br-klassik", "br24", "puls", as seen in the URL of the "Live" player.
* `Show` is the show's title as displayed in BR's "Live" player (https://www.br.de/radio/live/)
* `Channel` can be something like "bayern 2", "br-klassik", "br24", "puls". If an invalid Channel is given, all valid channel names get displayed.
`Show` and `Channel` are case-insensitive. \
Episodes already downloaded are skipped, so this script is well suited for cron jobs.
**Example:**
```./br-download.py bayern2 "IQ - Wissenschaft und forschung" "/data/aufnahmen```
```./br-download.py "bayern 2" "IQ - Wissenschaft und forschung" "/data/recordings```
This would download all available "IQ - Wissenschaft und Forschung" episodes from Bayern 2 and save them with full ID3 tags in the "/data/aufnahmen" directory.
This would download all available "IQ - Wissenschaft und Forschung" episodes from Bayern 2 and save them with full ID3 tags in the "/data/recordings" directory.
### Limitations
* As of January 2021 Bayerischer Rundfunk only offers the last 5 hours of its program as recordings, not the last 7 days
* Timestamps are way off. This means shows start earlier or later than expected and chapter markers are wrong. As it's the same on Bayerischer Rundfunk's "Live" web page it's most likely their fault.
* Shows aired very recently can't be downloaded. \
While all shows look the same on Bayerischer Rundfunk's website, the most recent ones usually have wrong cut marks, which means they start several minutes too early or too late, and their chapter markers are wrong.
A few minutes or hours after a show's end Bayerischer Rundfunk fixes all this and internally moves the show from the "live stream" to the "relive".
This script downloads shows only from the "relive".
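Whether a given hour is already in the "relive" can be checked by hand: the script fills an hourly playlist URL template (fetched from BR's API) and treats an HTTP 404 as "not yet in relive". A minimal sketch, with a made-up URL standing in for a filled-in template:

```python
import requests

# Hypothetical, already filled-in relive playlist URL for one hour of programme:
url = "https://example.invalid/bayern2/relive/20210110/18/index.m3u8"
response = requests.get(url, timeout=5)
# 404 means this hour is still only in the live stream, not yet in the relive
print("in relive" if response.status_code != 404 else "not yet in relive")
```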
### See also
If you want to listen to the downloaded shows with your podcast player: https://github.com/citronalco/mp3-to-rss2feed creates an RSS2 feed from MP3 files.
@@ -10,132 +10,118 @@ from dateutil.parser import parse
import pytz
from mutagen.id3 import ID3,ID3NoHeaderError,TRSN,TPE1,TALB,TRCK,TIT2,COMM,TYER,TDAT,TIME,TLEN,CTOC,CHAP,WOAS,WORS,APIC,CTOCFlags
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
from io import BytesIO
from pydub import AudioSegment
import argparse
parser = argparse.ArgumentParser(
description = "Find all availabe recordings of a show in Bayerischer Runfunk's player, download them as MP3 files and save the shows' metadata in the ID3 tags.",
)
parser.add_argument("Channel", help="The channel's name (e.g. \"Bayern2\", \"BR-Klassik\", \"Puls\")")
parser.add_argument("ShowTitle", help="The show's title (e.g. \"Zündfunk\")")
parser.add_argument("Directory", help="The directory to save the files in (e.g. \"Downloads/Zündfunk Recordings\")")
args = parser.parse_args()
CHANNEL = args.Channel
SHOW = args.ShowTitle
DESTDIR = args.Directory
if not os.path.isdir(DESTDIR):
print("Directory %s does not exist!" % DESTDIR, file=sys.stderr)
sys.exit(1)
baseUrl="https://www.br.de/radio/live/%s/programm/" % CHANNEL.lower()
# Fetch program information of the current day and fetch M3U8 data
day = date.today()
try:
html = requests.get(baseUrl + '/' + day.strftime("%Y-%m-%d") + '/', timeout=5).text
# extract JSON data embedded into HTML page
soup = BeautifulSoup(html, 'lxml')
jsonData = json.loads(soup.find('script', id='__NEXT_DATA__').encode_contents())
# get M3U8 with paths to media streams
streamsM3U8url = jsonData['props']['pageProps']['stationData']['audioBroadcastService']['sophoraLivestreamDocuments'][0]['streamingUrl']
streamsM3U8 = requests.get(streamsM3U8url).text
# retrieve all media stream paths from M3U8
streams = re.findall(r'^(?!#)(.*)\n', streamsM3U8, re.MULTILINE)
# get M3U8 with TS paths from media stream (streams are sorted by bitrate, last one has the highest)
tsBaseUrl = re.sub(r'([^\/]+?)\/?$','', streamsM3U8url)
tsM3U8 = requests.get(tsBaseUrl + streams[-1]).text
except:
print("Error: Could fetch download program information from %s" % baseUrl + '/' + day.strftime("%Y-%m-%d") + '/', file=sys.stderr)
exit(1)
# retrieve information about TS stream from M3U8
tsData = {
# name of the first TS snippet
'mediaSequence': int(re.search(r'^#EXT-X-MEDIA-SEQUENCE:\s*(\d+)$', tsM3U8, re.MULTILINE).group(1)),
# duration of each TS snippet
'targetDuration': int(re.search(r'^#EXT-X-TARGETDURATION:\s*(\d+)$', tsM3U8, re.MULTILINE).group(1)),
# datetime of oldest TS snippet
'programDateTime': parse(re.search(r'^#EXT-X-PROGRAM-DATE-TIME:\s*(.+)$', tsM3U8, re.MULTILINE).group(1)),
# URLs to all TS snippets
'segments': list(map(lambda x: tsBaseUrl + x, re.findall(r'^(?!#)(.*)\n', tsM3U8, re.MULTILINE)))
}
# search for broadcasts of requested show
foundBroadcasts = []
while True:
# loop broadcasts from new to old
for broadcast in reversed(jsonData['props']['pageProps']['stationDayProgramData']['audioBroadcastService']['epg']):
# stop on any broadcast too dated
if parse(broadcast['broadcastEvent']['start']) < tsData['programDateTime']:
break
# skip broadcasts not having ended yet
if parse(broadcast['broadcastEvent']['end']) > datetime.now(tz=pytz.timezone('Europe/Berlin')):
# Some API URLs discovered in the browser
audioBroadcastServicesUrl="https://brradio.br.de/radio/v4?query=query broadcastServices{audioBroadcastServices{trackingInfos{pageVars}nodes{id dvbServiceId name slug logo(type:SQUARE){url}logoSVG:logo(type:SQUARE,format:SVG){url}url sophoraLivestreamDocuments{sophoraId streamingUrl title reliveUrl trackingInfos{mediaVars}}}}}"
epgUrl='https://brradio.br.de/radio/v4?query=query broadcastDayProgram($stationSlug:String!,$day:MangoDay){audioBroadcastService(slug:$stationSlug){... on AudioBroadcastService{epg(day:$day){broadcastEvent{id start end trackingInfos{pageVars mediaVars}items{guid start duration class title ... on NewsElement{author}... on MusicElement{performer composer}}excludedTimeRanges{start end}isSeekableNews publicationOf{id kicker title description defaultTeaserImage{url}... on MangoProgramme{canonicalUrl title kicker}}}}}}}&variables={"stationSlug":"%s","day":"%s"}'
# Note:
# New broadcasts are available under streamingUrl, but with wrong timestamps, and their start and end are usually cut wrong.
# After a few hours they become available, corrected, under reliveUrl.
# So we only care about reliveUrl
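For illustration, a minimal sketch that queries the `audioBroadcastServicesUrl` defined above and lists each station's name, slug and relive URL template, mirroring what `main()` does below (it relies on the same undocumented JSON layout this script already assumes):

```python
import json
import requests

# audioBroadcastServicesUrl is the GraphQL query URL defined above in this script
response = requests.get(audioBroadcastServicesUrl, timeout=5)
data = json.loads(response.text)
for node in data['data']['audioBroadcastServices']['nodes']:
    docs = node['sophoraLivestreamDocuments']
    # each station has a slug (used in the EPG query) and a relive URL template
    print(node['name'], node['slug'], docs[0]['reliveUrl'] if docs else None)
```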
def getSegmentUrls(startDT,endDT,reliveUrlTemplate):
segmentsList = []
segmentsListEnd = startDT
while segmentsListEnd < endDT:
reliveStartDT = segmentsListEnd.replace(minute=0)
# fill placeholders in reliveUrl (playlists always start at full hour and last a full hour)
reliveUrl = str(reliveUrlTemplate)
reliveUrl = reliveUrl.replace('{yMd}', reliveStartDT.astimezone(pytz.timezone('Europe/Berlin')).strftime('%Y%m%d'))
reliveUrl = reliveUrl.replace('{H}', reliveStartDT.astimezone(pytz.timezone('Europe/Berlin')).strftime('%H'))
reliveUrl = reliveUrl.replace("+{Z}00", reliveStartDT.astimezone(pytz.timezone('Europe/Berlin')).strftime('%z'))
# fetch M3U playlist from reliveUrl
relivePlaylist = requests.get(reliveUrl).text
# this first playlist only contains names of the real playlists (aka media streams)
mediaStreams = re.findall(r'^(?!#)(.+)\n', relivePlaylist, re.MULTILINE)
# media streams and TS snippets have relative URLs to the reliveUrl
reliveBaseUrl = re.sub(r'([^\/]+?)\/?$','', reliveUrl)
# get real M3U8 playlist with TS paths from the last media stream (streams are sorted by bitrate, last one has the highest)
playlistUrl = reliveBaseUrl + mediaStreams[-1]
response = requests.get(playlistUrl)
if response.status_code == 404:
# show not yet in relive, so skip it
return None
tsM3U8 = response.text
# retrieve information about TS stream from M3U8
tsData = {
# duration of each TS snippet
'targetDuration': int(re.search(r'^#EXT-X-TARGETDURATION:\s*(\d+)$', tsM3U8, re.MULTILINE).group(1)),
# URLs to all TS snippets
'segments': list(map(lambda x: reliveBaseUrl + x, re.findall(r'^(?!#)(.*)\n', tsM3U8, re.MULTILINE)))
}
try:
# datetime of oldest TS snippet (missing in relive playlists)
tsData['programDateTime'] = parse(re.search(r'^#EXT-X-PROGRAM-DATE-TIME:\s*(.+)$', tsM3U8, re.MULTILINE).group(1))
except AttributeError:
# relive playlists always start at full hour
tsData['programDateTime'] = reliveStartDT.replace(minute=0)
# calculate the index of the first TS snippet for this broadcast in this relive playlist
ts_first = floor((startDT - tsData['programDateTime']).total_seconds() / tsData['targetDuration'])
if ts_first < 0:
ts_first = 0
segIdx = ts_first
try:
while segmentsListEnd < endDT:
segmentsList.append(tsData['segments'][segIdx])
segmentsListEnd = segmentsListEnd + timedelta(seconds=tsData['targetDuration'])
segIdx+=1
except IndexError:
continue
match = re.search('^\s*' + SHOW + '\s*$', broadcast['broadcastEvent']['trackingInfos']['pageVars']['topline'], flags=re.IGNORECASE)
if match:
foundBroadcasts.append(broadcast['broadcastEvent'])
return segmentsList
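For reference, the playlists parsed above are plain HLS (M3U8) text files. A small self-contained sketch of the same regex extraction, run against a made-up playlist (segment names and durations are hypothetical):

```python
import re

# Made-up example of a relive M3U8 playlist (structure only)
tsM3U8 = """#EXTM3U
#EXT-X-VERSION:3
#EXT-X-TARGETDURATION:6
#EXTINF:6.000,
segment_0001.ts
#EXTINF:6.000,
segment_0002.ts
"""

# Same extraction as in getSegmentUrls() above
targetDuration = int(re.search(r'^#EXT-X-TARGETDURATION:\s*(\d+)$', tsM3U8, re.MULTILINE).group(1))
segments = re.findall(r'^(?!#)(.+)\n', tsM3U8, re.MULTILINE)
print(targetDuration)  # 6
print(segments)        # ['segment_0001.ts', 'segment_0002.ts']
```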
else:
# no "break" happened above? -> get data of previous day and continue searching!
day = day - timedelta(days = 1)
html = requests.get(baseUrl + '/' + day.strftime("%Y-%m-%d") + '/', timeout=5).text
soup = BeautifulSoup(html, 'lxml')
jsonData = json.loads(soup.find('script', id='__NEXT_DATA__').encode_contents())
continue
# broadcasts are too dated already ("break" happened above), don't go further in the past
break
# download broadcasts, from old to new
for broadcast in reversed(foundBroadcasts):
# download and tag a broadcast
def download(broadcast, targetDir, reliveUrlTemplate):
broadcastStartDT = parse(broadcast['start'])
broadcastEndDT = parse(broadcast['end'])
# build filename from channel, show title and broadcast datetime, while escaping "bad" characters
filename = os.path.join(
DESTDIR,
targetDir,
re.sub(
r'[^\w\s\-\.\[\]]', '_',
broadcast['trackingInfos']['pageVars']['broadcast_service'] + ' ' + broadcastStartDT.astimezone(pytz.timezone('Europe/Berlin')).strftime("%Y-%m-%d %H:%M") + ' ' + broadcast['trackingInfos']['pageVars']['topline']
) + ".mp3"
)
# skip broadcast if the file already exists
if os.path.isfile(filename) and os.path.getsize(filename)>0:
print("%s already exists, skipping." % filename, flush=True)
continue
return
# calculate TS snippets for this broadcast
ts_first = floor( (broadcastStartDT - tsData['programDateTime']).total_seconds() / tsData['targetDuration'])
ts_last = ceil( (broadcastEndDT - tsData['programDateTime']).total_seconds() / tsData['targetDuration'])
# get links to all audio segments of this broadcast
segmentUrls = getSegmentUrls(broadcastStartDT, broadcastEndDT, reliveUrlTemplate)
if segmentUrls is None:
# skip broadcast if no segments available
print("Skipping %s, not yet in relive" % filename)
return
# download all TS segments and convert them to MP3
print("Downloading %s ..." % filename, end=" ", flush=True)
try:
sound = AudioSegment.empty()
for i in range(ts_first, ts_last):
sound += AudioSegment.from_file(BytesIO(urlopen(tsData['segments'][i]).read()))
for i in segmentUrls:
sound += AudioSegment.from_file(BytesIO(urlopen(i).read()))
sound.export(filename, format="mp3")
except Exception:
print("failed.", flush=True)
continue
return
else:
print("done.", flush=True)
@@ -159,7 +145,7 @@ for broadcast in reversed(foundBroadcasts):
tags.add(TIME(text=[broadcastStartDT.astimezone(pytz.timezone('Europe/Berlin')).strftime("%H%M")]))
tags.add(TLEN(text=[int((broadcastEndDT - broadcastStartDT).total_seconds() * 1000)]))
tags.add(WOAS(url=broadcast['publicationOf']['canonicalUrl']))
tags.add(WORS(url=baseUrl))
tags.add(WORS(url="https://www.br.de/radio/"))
# ID3: chapters
chapterNr = 0
@@ -189,7 +175,6 @@ for broadcast in reversed(foundBroadcasts):
))
chapterNr += 1
tocList = ",".join([ str(i) for i in range(0,chapterNr) ])
tags.add(CTOC(
@@ -200,11 +185,74 @@ for broadcast in reversed(foundBroadcasts):
))
# ID3: cover image
response = requests.get(broadcast['publicationOf']['defaultTeaserImage']['url'])
response = requests.get(broadcast['publicationOf']['defaultTeaserImage']['url'], timeout=5)
if response.status_code == 200:
tags.add(APIC(mime=response.headers['content-type'], desc="Front Cover", data=response.content))
# save ID3 tags
tags.save(filename,v2_version=3)
exit()
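`tags.save(filename, v2_version=3)` writes ID3v2.3 tags, which more players support than v2.4. To verify what was written, the chapter markers can be read back with mutagen; a short sketch, with a hypothetical filename:

```python
from mutagen.id3 import ID3

# Hypothetical file written by this script
tags = ID3("Bayern 2 2021-01-10 18_05 IQ - Wissenschaft und Forschung.mp3")
for chap in tags.getall("CHAP"):
    # start_time and end_time are in milliseconds
    print(chap.element_id, chap.start_time, chap.end_time, chap.sub_frames.get("TIT2"))
```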
def main():
parser = argparse.ArgumentParser(
description = "Find all availabe recordings of a show in Bayerischer Runfunk's player, download them as MP3 files and save the shows' metadata in the ID3 tags.",
)
parser.add_argument("Channel", help="The channel's name (e.g. \"bayern 2\", \"BR-Klassik\", \"Puls\")")
parser.add_argument("ShowTitle", help="The show's title (e.g. \"Zündfunk\")")
parser.add_argument("Directory", help="The directory to save the files in (e.g. \"Downloads/Zündfunk Recordings\")")
args = parser.parse_args()
channelName = args.Channel
show = args.ShowTitle
targetDir = args.Directory
# check if targetDir exists
if not os.path.isdir(targetDir):
print("Directory %s does not exist!" % targetDir, file=sys.stderr)
sys.exit(1)
# get reliveUrlTemplate and slug for given station name
audioBroadcastServicesPage=requests.get(audioBroadcastServicesUrl, timeout=5).text
audioBroadcastServicesJson=json.loads(audioBroadcastServicesPage)
channelList = []
for node in audioBroadcastServicesJson['data']['audioBroadcastServices']['nodes']:
channelList.append(node['name'].lower())
if channelName.lower() == node['name'].lower():
reliveUrlTemplate = node['sophoraLivestreamDocuments'][0]['reliveUrl']
slug = node['slug']
break
# if we have not found a reliveUrlTemplate, most likely the given channel name is wrong.
# So display the list of available channel names
try:
reliveUrlTemplate
except NameError:
print("Channel %s not found!" % channelName, file=sys.stderr)
print("Valid channels are: %s" % ", ".join(channelList))
sys.exit(1)
# Loop through last week, starting from today
today = date.today()
for dateDelta in range(0,8):
thisDay = today - timedelta(days = dateDelta)
# fetch EPG of this day
try:
epgPage = requests.get(epgUrl % (slug, thisDay.strftime("%Y-%m-%d")), timeout=5).text
epgData = json.loads(epgPage)
except Exception:
print("Error: Could not download program information for %s. Please try again later." % thisDay.strftime("%Y-%m-%d"), file=sys.stderr)
continue
# search in EPG for broadcasts of requested show
for broadcast in reversed(epgData['data']['audioBroadcastService']['epg']):
# skip broadcasts not having ended yet
if parse(broadcast['broadcastEvent']['end']) > datetime.now(tz=pytz.timezone('Europe/Berlin')):
continue
match = re.search(r'^\s*' + re.escape(show) + r'\s*$', broadcast['broadcastEvent']['trackingInfos']['pageVars']['topline'], flags=re.IGNORECASE)
if match:
download(broadcast['broadcastEvent'], targetDir, reliveUrlTemplate)
if __name__ == "__main__":
main()
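Similarly, a sketch of fetching one day's EPG with the `epgUrl` template defined above and listing the shows' titles; the slug "bayern-2" is a hypothetical value of the kind the station query returns, and the JSON layout is the one this script already assumes:

```python
import json
import requests

# epgUrl is the GraphQL query template defined above in this script;
# it takes a station slug and a day in YYYY-MM-DD format
epgPage = requests.get(epgUrl % ("bayern-2", "2021-01-10"), timeout=5).text
for broadcast in json.loads(epgPage)['data']['audioBroadcastService']['epg']:
    event = broadcast['broadcastEvent']
    # the show's title lives in the tracking info's "topline" field
    print(event['start'], event['trackingInfos']['pageVars']['topline'])
```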