Commit 4c5188b6 authored by Bernhard Geier's avatar Bernhard Geier

Rewrite in Python 3

Use FM4-provided metadata to fill ID3 tags
parent 38f85baf
# FM4 7-Tage Backup
The Austrian radio station FM4 publishes MP3 recordings of all of their shows during the last 7 days on its website.
This script is a simple command line interface for their player API, and allows you to download all currently available recordings for a specific show.
Example:
This Python 3 script is a simple command line tool to download all currently available recordings for a specific show.
### Requirements
Python 3 with modules "mutagen", "urllib3" and "requests".
(On Debian/Ubuntu: `sudo apt install python3 python3-mutagen python3-urllib3 python3-requests`)
### Usage
```./fm4-7tage-download.py <ShowTitle> <TargetDirectory>```
The script searches in FM4's 7-Tage-Player API for shows with a matching name and downloads them into the given target directory.
Files already present are skipped, so it is well suited for cron jobs.
The show's metadata gets stored in the downloaded MP3 file's ID3 tags (see below).
If a show's recording is split into multiple parts (e.g. "Morning Show"), the script will download all parts and name them accordingly ("FM4 Morning Show 2020-09-03 06_00 **[1_5]**.mp3", "FM4 Morning Show 2020-09-03 06_00 **[2_5]**.mp3", ...).
**Example:**
```./fm4-7tage-download.py "morning show" Downloads/Morning-Show-Recordings```
This would download all available recordings of "Morning Show" and save them with correct ID3 tags in the "Downloads/Morning-Show-Recordings" directory.
## ID3 Tags
The show's metadata is used **extensively** to set the ID3v2.3 tags for downloaded recordings.
```./fm4-7tage-download.pl "morning show" Downloads/Morning-Show-Recordings```
**Example:**
This would download all available recordings of "Morning Show" and save them in the "Downloads/Morning-Show-Recordings" directory.
Recordings get correct ID3 tags and files already downloaded are skipped, so it should be well suited for cron jobs.
The downloaded file `Downloads/Morning-Show-Recordings/FM4 Morning Show 2020-09-03 06_00 [1_5].mp3` gets these ID3 tags:
```
TPE1 (Lead performer(s)/Soloist(s)): FM4
TALB (Album/Movie/Show title): Morning Show
TIT2 (Title/songname/content description): 2020-09-03 06:00 [1/5]
TRCK (Track number/Position in set): 1/5
TLEN (Length): 00:26:33
TDAT (Date): 0309
TIME (Time): 0600
TYER (Year): 2020
APIC (Attached picture): (Front Cover)[, 3]: image/jpeg, 32580 bytes
COMM (Comments): (desc)[deu]: Die FM4 Morning Show mit Dave Dempsey und Christoph Sepin | Wir machen Urlaub
auf der schönsten Insel Österreichs, vergeben ein sehr rares Exemplar des FM4
Kalenders, erzählen euch alles über das Filmfestival Venedig und freuen uns,
dass unseren Austrian Act of the Day Strandhase zu Gast zu haben.
TRSN (Internet radio station name): FM4
WORS (Official internet radio station homepage): http://fm4.orf.at
WOAS (Official audio source webpage): http://fm4.orf.at/radio/stories/fm4morningshow
CHAP (Chapters):
Chapter #0: start 0.000000, end 171.000000
title : News
Chapter #1: start 170.000000, end 187.000000
title : ch2
Chapter #2: start 206.000000, end 423.000000
title : Kid Simius ft. Enda Gallery / Livin'It Up
Chapter #3: start 420.000000, end 656.000000
title : Tame Impala / Is It True
Chapter #4: start 652.000000, end 859.000000
title : Warpaint / New Song
Chapter #5: start 859.000000, end 1083.000000
title : FM4 Inselhüpfen: Die Wiener Donauinsel
Chapter #6: start 1079.000000, end 1324.000000
title : Der Nino Aus Wien / Praterlied / Live 29/08/2020 Alter Schalchthof Wels; 30/08 Sommerspiele
Perchtoldsdorf; 04/09 Aula Linz
Chapter #7: start 1323.000000, end 1368.000000
title : ch8
Chapter #8: start 1368.000000, end 1562.000000
title : DJ Shadow ft Run The Jewels / Nobody Speak / from the album 'The Mountain Will Fall' out
June 24, 2016 FM4 Soundselection 35, out November 11, 2016
```
#!/usr/bin/perl
use strict;
use warnings;
use WWW::Mechanize;
use URI::Encode;
use JSON;
use POSIX;
use HTML::Strip;
use File::Spec;
use MP3::Tag;
MP3::Tag->config(write_v24=>1);
use Unicode::String qw(utf8 latin1);
$|=1;
# Command line handling: exactly a show title and a target directory are required.
if (@ARGV != 2) {
    print "USAGE:\n";
    print $0." <ShowTitle> <downloadDir>\n\n";
    print "Example:\n";
    print $0.' "Morning Show" "Downloads/Morning Show Recordings"'."\n";
    exit;
}

my $SENDUNG=$ARGV[0];
my $DESTDIR=$ARGV[1];
die("Directory ".$DESTDIR." does not exist!\n") unless (-d $DESTDIR);

my $searchUrl="https://audioapi.orf.at/fm4/api/json/current/search?q=";
my $shoutcastBaseUrl="http://loopstream01.apa.at/?channel=fm4&id=";

# autocheck is switched off so that a failed download attempt returns a
# response object (which the retry loop below inspects) instead of aborting
# the whole script. The API requests are checked explicitly instead.
my $browser=WWW::Mechanize->new(timeout=>5, autocheck=>0);
my $removeHtml=HTML::Strip->new();

# Search the API for the show title.
$browser->get($searchUrl.URI::Encode::uri_encode($SENDUNG));
die("Search request failed with status ".$browser->status."\n") unless $browser->success;
my $result=JSON->new()->utf8->decode($browser->content());

foreach my $hit (@{$result->{'hits'} || []}) {
    # Fetch the data linked from the search hit.
    my $href=$hit->{'data'}->{'href'};
    $browser->get($href);
    die("Request for ".$href." failed with status ".$browser->status."\n") unless $browser->success;
    my $data=JSON->new()->utf8->decode($browser->content());

    # Trim the title; skip results without a usable title.
    my $rawTitle=defined($data->{'title'}) ? $data->{'title'} : '';
    my ($title)=$rawTitle=~/^\s*(.+?)\s*$/;
    next unless defined($title);
    # Filter out results not containing the query string in the title.
    # \Q..\E makes the query match literally even if it contains regex characters.
    next unless $title=~/\Q$SENDUNG\E/i;

    my $broadcastDate=POSIX::strftime("%Y-%m-%d %H:%M",localtime($data->{'start'}/1000));
    my $description=$removeHtml->parse($data->{'description'} || $broadcastDate);
    $removeHtml->eof; # reset the stripper before it is reused for the next broadcast

    # for multi-part shows (e.g. "Morning Show") sort them by start time (numerically)...
    my @parts=sort { $a->{'start'} <=> $b->{'start'} } @{$data->{'streams'} || []};

    for (my $i=0; $i<@parts; $i++) {
        my $tagTitle=$title." ".$broadcastDate;
        $tagTitle.=" [".($i+1)."/".@parts."]" if @parts>1; # ...and add "[currentPartNo/totalParts]" to title

        # Build a file name from the tag title, replacing unusual characters.
        my $filename=$tagTitle.".mp3";
        $filename=~s/[^\w\s\-\.\[\]]/_/g;
        $filename="FM4 ".$filename unless $filename=~/^FM4/;

        my $file=File::Spec->join($DESTDIR,$filename);
        if (-f $file) {
            print $filename." already exists, skipping.\n";
            next;
        }

        print $filename." downloading... ";

        my $url=$shoutcastBaseUrl.$parts[$i]->{'loopStreamId'};
        my $partFile=$file.".part";
        my $tries=4;
        my $done=0;

        while ($tries>0 and not $done) {
            # Append to the partial file so an interrupted download can be resumed.
            open(my $FD,'>>',$partFile) or die("Cannot open ".$partFile." for writing: ".$!."\n");
            binmode($FD);
            my $bytes=(-s $partFile) || 0;

            # The argument list is rebuilt for every attempt so that at most
            # one Range header is ever sent.
            my @parameters=(
                $url,
                ":content_cb" => sub {
                    my ($chunk) = @_;
                    print {$FD} $chunk;
                });
            push(@parameters,"Range"=>"bytes=".$bytes."-") if $bytes>0;

            my $response=$browser->get(@parameters);
            close($FD);

            # 416 is accepted as well: the Range request started at the end of the data.
            if ($response->is_success or $response->code == 416) {
                $done=1;
            } else {
                $tries--;
            }
        }

        unless ($done) {
            print "failed.\n";
            next;
        }

        rename($partFile,$file) or die("Cannot rename ".$partFile." to ".$file.": ".$!."\n");

        # Write artist, title and description into the ID3v2 tag.
        my $tag=MP3::Tag->new($file);
        $tag->get_tags;
        $tag->new_tag("ID3v2") unless (exists $tag->{ID3v2});
        $tag->{ID3v2}->artist("FM4");
        $tag->{ID3v2}->title($tagTitle);
        $tag->{ID3v2}->comment($description);
        $tag->{ID3v2}->write_tag;
        print "done.\n";
    }
}
#!/usr/bin/env python3
# TODO:
# - Does the metadata tell which song was played when? -> chapters!!
# - retries for requests
# - https://gist.github.com/Foolson/1db5620023675e55594e3af44f69a70d
# - https://id3.org/id3v2.3.0
# - chapters in rss: https://gist.github.com/gglnx/5233635
# - argparser
import os
import re
import sys
import time
import urllib.parse
from datetime import datetime

import requests
from mutagen.id3 import ID3,ID3NoHeaderError,TRSN,TPE1,TALB,TRCK,TIT2,COMM,TYER,TDAT,TIME,TLEN,TDRL,CTOC,CHAP,WOAS,WORS,APIC,CTOCFlags
# API endpoints; the "%s" placeholder is filled in where the URLs are used
# (URL-quoted search term for the search, loopStreamId for the download).
searchUrl = "https://audioapi.orf.at/fm4/api/json/current/search?q=%s"
shoutcastBaseUrl = "http://loopstream01.apa.at/?channel=fm4&id=%s"

# Exactly two arguments are expected: the show title and the target directory.
if len(sys.argv) != 3:
    usageLines = (
        "Usage:",
        "%s <ShowTitle> <DownloadDir>\n" % sys.argv[0],
        "Example:",
        "%s 'Morning Show' 'Downloads/Morning Show Recordings'\n" % sys.argv[0],
    )
    for usageLine in usageLines:
        print(usageLine, file=sys.stderr)
    sys.exit(1)

SHOW, DESTDIR = sys.argv[1], sys.argv[2]

# The target directory is never created by the script; it has to exist already.
if not os.path.isdir(DESTDIR):
    print("Directory %s does not exist!" % DESTDIR, file=sys.stderr)
    sys.exit(1)
# remove html tags
def strip_html(text: str):
    """Return *text* with HTML tags removed.

    ``None`` is passed through unchanged. ``<br>`` tags are followed by
    ", " and a ``</p><p>`` paragraph boundary is replaced by " | ", so the
    flattened text stays readable. Runs of two or more whitespace characters
    are collapsed into a single space.
    """
    if text is None:
        return None

    # add a comma after <br/>
    text = re.sub(r'(<br/?>)', r'\1, ', text, flags=re.IGNORECASE)
    # add a | between </p> and <p>
    text = re.sub(r'\s*(</p>)\s*(<p>)\s*', r'\1 | \2 ', text, flags=re.IGNORECASE)

    in_tag = False
    # The quote character that opened the current attribute value, if any.
    # Remembering *which* quote opened the value keeps an apostrophe inside
    # a double-quoted attribute (title="it's") from ending it prematurely.
    open_quote = None
    kept = []
    for c in text:
        if not in_tag:
            if c == '<':
                in_tag = True
            elif c != '>':  # a stray '>' outside of a tag is dropped
                kept.append(c)
        elif open_quote is not None:
            # inside a quoted attribute value only the matching quote counts
            if c == open_quote:
                open_quote = None
        elif c == '"' or c == "'":
            open_quote = c
        elif c == '>':
            in_tag = False

    # remove multiple consecutive spaces
    return re.sub(r'\s\s+', ' ', ''.join(kept))
# download in chunks
def download(url: str, file_path: str, attempts=4, timeout=30):
    """Stream *url* into *file_path*, retrying when an attempt fails.

    Args:
        url: Address to download.
        file_path: Destination file; it is overwritten on every attempt.
        attempts: Maximum number of download attempts.
        timeout: Timeout in seconds handed to ``requests.get`` so that a
            stalled connection fails instead of blocking forever.

    Returns:
        True if the file was downloaded completely, False if every attempt
        failed (or *attempts* is less than 1).
    """
    for attempt in range(1, attempts + 1):
        if attempt > 1:
            time.sleep(3)  # wait 3 seconds between download attempts
        try:
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                # 'wb' truncates, so a retry never appends to a partial file
                with open(file_path, 'wb') as out_file:
                    for chunk in response.iter_content(chunk_size=1024*1024):  # 1MB chunks
                        out_file.write(chunk)
            return True  # success
        except (requests.RequestException, OSError):
            # network or file system problem: fall through to the next attempt
            continue
    return False
# Main part of the script: search the API for SHOW, then download and tag
# every stream part of each matching broadcast.

# search for show
response = requests.get(searchUrl % urllib.parse.quote_plus(SHOW), timeout=5)
response.raise_for_status()
result = response.json()

# The title has to contain the show name; re.escape() makes the name match
# literally even if it contains regex metacharacters such as "(" or "+".
showPattern = re.compile(r'^\s*(.*?' + re.escape(SHOW) + r'.*?)\s*$', flags=re.IGNORECASE)

# for each search result fetch linked data
for hit in result['hits']:
    # only care about "Broadcast" and skip everything else
    if hit['data']['entity'] != "Broadcast":
        continue

    # get json of matching broadcast
    broadcastJson = requests.get(hit['data']['href'], timeout=5).json()

    # extract show name. skip results not containing the show's name in the title
    match = showPattern.search(broadcastJson['title'])
    if not match:
        continue
    showName = match.group(1)

    # extract start datetime (the timestamp is divided by 1000 to get seconds)
    showStart = datetime.fromtimestamp(broadcastJson['start']/1000)

    # build show description: first non-empty candidate, else the start time
    showDescription = None
    for descriptionKey in ("description", "subtitle", "pressRelease"):
        candidate = strip_html(broadcastJson.get(descriptionKey))
        if candidate and candidate.strip():
            showDescription = candidate
            break
    if showDescription is None:
        showDescription = showStart.strftime("%Y-%m-%d %H:%M")

    # most shows have one part in the stream, some shows (e.g. Morning Show) are split into multiple stream parts
    # download them, sorted by start time
    streams = sorted(broadcastJson['streams'], key=lambda x: x['start'])
    for streamNr, stream in enumerate(streams):
        # Everything below uses `stream` from the *sorted* list, so the part
        # number in the title always belongs to the data that is downloaded.
        streamStart = stream['start']
        streamEnd = stream['end']

        tagTitle = showStart.strftime("%Y-%m-%d %H:%M")
        if len(streams) > 1:
            tagTitle += " [" + str(streamNr+1) + "/" + str(len(streams)) + "]"

        filename = re.sub(r'[^\w\s\-\.\[\]]', '_', showName + " " + tagTitle)
        if not filename.startswith("FM4 "):
            filename = "FM4 " + filename
        filename += ".mp3"

        filepath = os.path.join(DESTDIR, filename)
        partpath = filepath + ".part"
        if os.path.isfile(filepath) and os.path.getsize(filepath) > 0:
            print("%s already exists, skipping." % filepath, flush=True)
            continue

        print("%s downloading..." % filepath, end=" ", flush=True)
        if not download(shoutcastBaseUrl % stream['loopStreamId'], partpath):
            print("failed.", flush=True)
            continue

        # set ID3 tag: drop any tag that came with the download and start fresh
        try:
            tags = ID3(partpath)
            tags.delete()
        except ID3NoHeaderError:
            tags = ID3()

        tags.add(TRSN(text=["FM4"]))
        tags.add(TPE1(text=["FM4"]))
        tags.add(TALB(text=[showName]))
        tags.add(TRCK(text=[str(streamNr+1) + "/" + str(len(streams))]))
        tags.add(TIT2(text=[tagTitle]))
        tags.add(COMM(lang="deu", desc="desc", text=[showDescription]))
        tags.add(TYER(text=[showStart.strftime("%Y")]))
        tags.add(TDAT(text=[showStart.strftime("%d%m")]))
        tags.add(TIME(text=[showStart.strftime("%H%M")]))
        tags.add(TLEN(text=[str(streamEnd - streamStart)]))
        tags.add(WOAS(url=broadcastJson['url']))
        tags.add(WORS(url="http://fm4.orf.at"))

        # set chapter information according to show's "items"
        # https://mutagen.readthedocs.io/en/latest/user/id3.html
        chapters = []
        for item in sorted(broadcastJson.get('items') or [], key=lambda x: x['start']):
            if item['entity'] != "BroadcastItem":
                continue
            if item['end'] <= streamStart:
                continue  # item ended before this part started
            if item['start'] >= streamEnd:
                break  # items are sorted, nothing further belongs to this part

            chapterTitle = [
                strip_html(item[key])
                for key in ("interpreter", "title", "description")
                if item.get(key) is not None
            ]
            chapters.append({
                "id": "ch" + str(len(chapters) + 1),
                "title": " / ".join(chapterTitle),
                # Offsets are relative to the start of this part and clamped
                # to it: an item may begin before or end after the part, and
                # a negative offset presumably cannot be stored in a CHAP
                # frame (unsigned field - TODO confirm).
                "startTime": max(item['start'], streamStart) - streamStart,
                "endTime": min(item['end'], streamEnd) - streamStart  # FIXME: chapters (and shows?) seem to be 1s too long
            })

        if chapters:
            for c in chapters:
                tags.add(CHAP(
                    element_id = c["id"],
                    start_time = c["startTime"],
                    end_time = c["endTime"],
                    sub_frames = [TIT2(text=[c["title"]])]
                ))
            # the table of contents references every chapter by its own id
            tags.add(CTOC(
                element_id = "toc",
                flags = CTOCFlags.TOP_LEVEL | CTOCFlags.ORDERED,
                child_element_ids = [c["id"] for c in chapters],
                sub_frames = [TIT2(text=["Table Of Contents"])]
            ))

        # cover image: try image versions 2, 1, 0 and keep the first that loads
        for i in range(2, -1, -1):
            try:
                imageResponse = requests.get(broadcastJson['images'][0]['versions'][i]['path'], timeout=5)
                if imageResponse.status_code == 200:
                    tags.add(APIC(mime=imageResponse.headers['content-type'], desc="Front Cover", data=imageResponse.content))
                    break
            except (KeyError, IndexError, TypeError, requests.RequestException):
                # missing image data or failed request: the cover is optional
                continue

        # save ID3 tags
        tags.save(partpath, v2_version=3)

        # done: os.replace also overwrites an existing (empty) target file
        os.replace(partpath, filepath)
        print("done.", flush=True)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment