Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Citronalco
br-download
Commits
03a8accc
Commit
03a8accc
authored
Sep 15, 2020
by
Bernhard Geier
Browse files
add playlist from BR's website to comments
make both scripts more similar
parent
c09f06c1
Changes
2
Show whitespace changes
Inline
Side-by-side
nachtmix-download.py
View file @
03a8accc
...
...
@@ -15,7 +15,12 @@ import lxml
from
bs4
import
BeautifulSoup
#import pprint
baseUrl
=
"https://www.br.de/radio/bayern2/sendungen/nachtmix/index.html"
;
baseUrl
=
"https://www.br.de/radio/bayern2/sendungen/nachtmix/index.html"
playlistsBaseUrl
=
"https://www.br.de/radio/bayern2/sendungen/nachtmix/playlisten/index.html"
minimalEpisodeDuration_ms
=
45
*
60
*
1000
showTitle
=
"Nachtmix"
def
download
(
url
:
str
,
attempts
=
4
):
tmpfile
=
NamedTemporaryFile
(
delete
=
False
)
...
...
@@ -31,20 +36,6 @@ def download(url: str, attempts=4):
pass
return
None
if
len
(
sys
.
argv
)
!=
2
:
print
(
"Usage:"
,
file
=
sys
.
stderr
)
print
(
"%s <DownloadDir>
\n
"
%
sys
.
argv
[
0
],
file
=
sys
.
stderr
)
print
(
"Example:"
,
file
=
sys
.
stderr
)
print
(
"%s 'Downloads/Nachtmix Recordings'
\n
"
%
sys
.
argv
[
0
],
file
=
sys
.
stderr
)
sys
.
exit
(
1
)
DESTDIR
=
sys
.
argv
[
1
]
if
not
os
.
path
.
isdir
(
DESTDIR
):
print
(
"Directory %s does not exist!"
%
DESTDIR
,
file
=
sys
.
stderr
)
sys
.
exit
(
1
)
def
time2seconds
(
timestr
:
str
):
# return duration of HH:MM:SS in seconds
parts
=
re
.
split
(
":"
,
timestr
)
...
...
@@ -57,6 +48,48 @@ def safe_text_get(l: list, idx: int, default=None):
except
IndexError
:
return
default
def _fetch_soup(url: str):
    """Download ``url`` (5 second timeout) and return it parsed with lxml."""
    return BeautifulSoup(requests.get(url, timeout=5).text, 'lxml')


def get_playlist_as_text(dt: datetime):
    """Return the playlist for the broadcast on ``dt`` as one line of text.

    Three pages are fetched in turn: the calendar page at
    ``playlistsBaseUrl``, the page of the day matching ``dt`` that is linked
    from it, and the playlist page linked from that day page. Every
    ``p.copytext`` paragraph of the playlist page becomes one entry (its text
    nodes joined by " - "); the entries are joined by " | ".

    The lookup is best effort: if any step fails (network error, no link for
    that date, unexpected page layout, ...) ``None`` is returned instead of
    raising, so a missing playlist never aborts a download.
    """
    try:
        # The show title is matched literally inside the patterns below,
        # so escape anything that would otherwise act as a regex operator.
        showSlug = re.escape(showTitle.lower())

        # get website with calendar entries of all available playlists
        soup = _fetch_soup(playlistsBaseUrl)

        # select day; find() returns None when nothing matches, so the
        # ['href'] lookup raises and we end up in the handler below
        dayLink = soup.find(
            'a',
            class_=re.compile(r'^playlisten.+'),
            href=re.compile(r'.+_date-' + dt.strftime("%Y-%m-%d") + r'_.+\.html$'),
        )['href']
        dayUrl = urllib.parse.urljoin(playlistsBaseUrl, dayLink)

        # follow link to playlist; the show name part of the link is optional
        soup = _fetch_soup(dayUrl)
        plsLink = soup.find(
            'a',
            class_=re.compile(r'^playlist(-' + showSlug + r')?-\d+$'),
            href=re.compile(r'.+playlist(-' + showSlug + r')?-\d+\.html$'),
        )['href']
        plsUrl = urllib.parse.urljoin(playlistsBaseUrl, plsLink)

        # read playlist: one entry per paragraph
        soup = _fetch_soup(plsUrl)
        return " | ".join(
            " - ".join(entry.find_all(text=True))
            for entry in soup.select('div.detail_content > p.copytext')
        )
    except Exception:
        # Deliberately broad (the playlist is optional), but no longer a bare
        # "except:" so Ctrl-C and sys.exit() are not swallowed.
        return None
# Exactly one command line argument is expected: the download directory.
if len(sys.argv) == 2:
    DESTDIR = sys.argv[1]
else:
    # Wrong number of arguments: print usage and an example call, then quit
    print(
        "Usage:",
        "%s <DownloadDir>\n" % sys.argv[0],
        "Example:",
        "%s 'Downloads/%s Recordings'\n" % (sys.argv[0], showTitle),
        sep="\n",
        file=sys.stderr,
    )
    sys.exit(1)

# The download directory must already exist; it is not created here.
if os.path.isdir(DESTDIR) is False:
    print("Directory %s does not exist!" % DESTDIR, file=sys.stderr)
    sys.exit(1)
html
=
requests
.
get
(
baseUrl
,
timeout
=
5
).
text
soup
=
BeautifulSoup
(
html
,
'lxml'
)
...
...
@@ -151,14 +184,15 @@ for bc in broadcastJson['channelBroadcasts']:
'filename'
:
None
,
'filepath'
:
None
,
'duration_ms'
:
time2seconds
(
XMLmeta
[
'duration'
])
*
1000
,
'playlist_text'
:
None
,
}
## Filter out some episodes
#
I know that a real Nachtmix episode is longer than 45 minutes.
Skip this episode if it is shorter
if
meta
[
'duration_ms'
]
<
45
*
60
*
1000
:
# Skip this episode if it is shorter
than defined minimal duration
if
meta
[
'duration_ms'
]
<
minimalEpisodeDuration_ms
:
continue
# Skip
all non "Nachtmix" broadcasts
if
XMLmeta
[
'broadcast'
].
lower
()
!=
'nachtmix'
:
# Skip
this episode if "Broadcast" is not matching the show's title
if
XMLmeta
[
'broadcast'
].
lower
()
!=
showTitle
.
lower
()
:
continue
...
...
@@ -177,7 +211,7 @@ for bc in broadcastJson['channelBroadcasts']:
## Populate values in "meta" dict
# agf_c9 looks like "Nachtmix_Nachtmix_27.08.2020_23:05"
# agf_c9 looks like
"Zündfunk_Zündfunk_27.08.2020_19:05" or "Zündfunk_Zündfunk Generator_30.08.2020_22:05" or
"Nachtmix_Nachtmix_27.08.2020_23:05"
# so it can be used to extract the episode's exact broadcast time
try
:
parts
=
XMLmeta
[
'agf_c9'
].
split
(
'_'
)
...
...
@@ -202,6 +236,12 @@ for bc in broadcastJson['channelBroadcasts']:
print
(
"ERROR: Could not download %s"
%
url
,
file
=
sys
.
stderr
)
sys
.
exit
(
1
)
# get playlist
playlist_text
=
get_playlist_as_text
(
meta
[
'broadcastDate_dt'
])
if
playlist_text
:
meta
[
'playlist_text'
]
=
"PLAYLIST: "
+
playlist_text
# set ID3 tag
try
:
tag
=
ID3
(
tmpFile
)
...
...
@@ -215,7 +255,7 @@ for bc in broadcastJson['channelBroadcasts']:
tag
.
add
(
TRCK
(
text
=
[
"1/1"
]))
#tag.add(TIT2(text=[meta['broadcastDate_dt'].strftime("%Y-%m-%d") + ": "+XMLmeta['title']]))
tag
.
add
(
TIT2
(
text
=
[
XMLmeta
[
'title'
]]))
tag
.
add
(
COMM
(
lang
=
"deu"
,
desc
=
"desc"
,
text
=
[
XMLmeta
[
'desc'
]
]))
tag
.
add
(
COMM
(
lang
=
"deu"
,
desc
=
"desc"
,
text
=
[
" /// "
.
join
(
filter
(
None
,
[
XMLmeta
[
'desc'
],
meta
[
'playlist_text'
]]))
]))
tag
.
add
(
TYER
(
text
=
[
meta
[
'broadcastDate_dt'
].
strftime
(
"%Y"
)]))
tag
.
add
(
TDAT
(
text
=
[
meta
[
'broadcastDate_dt'
].
strftime
(
"%d%m"
)]))
tag
.
add
(
TIME
(
text
=
[
meta
[
'broadcastDate_dt'
].
strftime
(
"%H%M"
)]))
...
...
zuendfunk-download.py
View file @
03a8accc
...
...
@@ -15,8 +15,12 @@ import lxml
from
bs4
import
BeautifulSoup
#import pprint
baseUrl
=
"https://www.br.de/radio/bayern2/sendungen/zuendfunk/programm-nachhoeren/index.html"
playlistsBaseUrl
=
"https://www.br.de/radio/bayern2/sendungen/zuendfunk/pop-platten/playlisten/index.html"
minimalEpisodeDuration_ms
=
45
*
60
*
1000
showTitle
=
"Zündfunk"
def
download
(
url
:
str
,
attempts
=
4
):
tmpfile
=
NamedTemporaryFile
(
delete
=
False
)
...
...
@@ -32,20 +36,6 @@ def download(url: str, attempts=4):
pass
return
None
if
len
(
sys
.
argv
)
!=
2
:
print
(
"Usage:"
,
file
=
sys
.
stderr
)
print
(
"%s <DownloadDir>
\n
"
%
sys
.
argv
[
0
],
file
=
sys
.
stderr
)
print
(
"Example:"
,
file
=
sys
.
stderr
)
print
(
"%s 'Downloads/Zündfunk Recordings'
\n
"
%
sys
.
argv
[
0
],
file
=
sys
.
stderr
)
sys
.
exit
(
1
)
DESTDIR
=
sys
.
argv
[
1
]
if
not
os
.
path
.
isdir
(
DESTDIR
):
print
(
"Directory %s does not exist!"
%
DESTDIR
,
file
=
sys
.
stderr
)
sys
.
exit
(
1
)
def
time2seconds
(
timestr
:
str
):
# return duration of HH:MM:SS in seconds
parts
=
re
.
split
(
":"
,
timestr
)
...
...
@@ -71,7 +61,7 @@ def get_playlist_as_text(dt: datetime):
# follow link to playlist
html
=
requests
.
get
(
dayUrl
,
timeout
=
5
).
text
soup
=
BeautifulSoup
(
html
,
'lxml'
)
plsLink
=
soup
.
find
(
'a'
,
class_
=
re
.
compile
(
"^playlist\-\d+$"
),
href
=
re
.
compile
(
'.+playlist\-\d+.html$'
))[
'href'
]
plsLink
=
soup
.
find
(
'a'
,
class_
=
re
.
compile
(
"^playlist
(\-"
+
showTitle
.
lower
()
+
")?
\-\d+$"
),
href
=
re
.
compile
(
'.+playlist
(\-'
+
showTitle
.
lower
()
+
')?
\-\d+.html$'
))[
'href'
]
plsUrl
=
urllib
.
parse
.
urljoin
(
playlistsBaseUrl
,
plsLink
)
# read playlist
...
...
@@ -87,6 +77,20 @@ def get_playlist_as_text(dt: datetime):
return
None
# Exactly one command line argument is expected: the download directory.
if len(sys.argv) == 2:
    DESTDIR = sys.argv[1]
else:
    # Wrong number of arguments: print usage and an example call, then quit
    print(
        "Usage:",
        "%s <DownloadDir>\n" % sys.argv[0],
        "Example:",
        "%s 'Downloads/%s Recordings'\n" % (sys.argv[0], showTitle),
        sep="\n",
        file=sys.stderr,
    )
    sys.exit(1)

# The download directory must already exist; it is not created here.
if os.path.isdir(DESTDIR) is False:
    print("Directory %s does not exist!" % DESTDIR, file=sys.stderr)
    sys.exit(1)
html
=
requests
.
get
(
baseUrl
,
timeout
=
5
).
text
soup
=
BeautifulSoup
(
html
,
'lxml'
)
...
...
@@ -156,7 +160,7 @@ for bc in broadcastJson['channelBroadcasts']:
# extract metadata from XML with longest audio
XMLmeta
=
{
'topline'
:
safe_text_get
(
xmls
[
0
].
xpath
(
"./audio/topline"
),
0
),
'title'
:
safe_text_get
(
xmls
[
0
].
xpath
(
"./audio/title"
),
0
),
'title'
:
re
.
sub
(
"^Jetzt nachhören: "
,
""
,
safe_text_get
(
xmls
[
0
].
xpath
(
"./audio/title"
),
0
)
)
,
'shareTitle'
:
safe_text_get
(
xmls
[
0
].
xpath
(
"./audio/shareTitle"
),
0
),
'duration'
:
safe_text_get
(
xmls
[
0
].
xpath
(
"./audio/duration"
),
0
),
'channel'
:
safe_text_get
(
xmls
[
0
].
xpath
(
"./audio/channel"
),
0
,
"BAYERN 2"
),
...
...
@@ -184,11 +188,11 @@ for bc in broadcastJson['channelBroadcasts']:
}
## Filter out some episodes
#
I know that a real Zündfunk episode is longer than 45 minutes.
Skip this episode if it is shorter
if
meta
[
'duration_ms'
]
<
45
*
60
*
1000
:
# Skip this episode if it is shorter
than defined minimal duration
if
meta
[
'duration_ms'
]
<
minimalEpisodeDuration_ms
:
continue
# Skip
all non "Zündfunk" broadcasts like "Zündfunk Generator" (which has an official podcast feed)
if
XMLmeta
[
'broadcast'
].
lower
()
!=
'zündfunk'
:
# Skip
this episode if "Broadcast" is not matching the show's title
if
XMLmeta
[
'broadcast'
].
lower
()
!=
showTitle
.
lower
()
:
continue
...
...
@@ -207,7 +211,7 @@ for bc in broadcastJson['channelBroadcasts']:
## Populate values in "meta" dict
# agf_c9 looks like "Zündfunk_Zündfunk_27.08.2020_19:05" or "Zündfunk_Zündfunk Generator_30.08.2020_22:05"
# agf_c9 looks like "Zündfunk_Zündfunk_27.08.2020_19:05" or "Zündfunk_Zündfunk Generator_30.08.2020_22:05"
or "Nachtmix_Nachtmix_27.08.2020_23:05"
# so it can be used to extract the episode's exact broadcast time
try
:
parts
=
XMLmeta
[
'agf_c9'
].
split
(
'_'
)
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment