Commit e4074560 authored by citronalco's avatar citronalco
Browse files

add some scrapers

parent bca14fde
website2ics
===========
Perl command-line scripts that automatically scrape event information from several web sites and build nice ics calendar files from it.
License
-------
This project is licensed under the terms of the GPLv3 License.
Supported web sites
-------------------
### Eventhalle Westpark, Ingolstadt
[http://www.eventhalle-westpark.de/das-programm]
Script: `eventhalleWestpark2ics.pl`
Demo: [http://www.geierb.de/~geierb/kalender/eventhallewestpark.ics]
### Kulturzentrum Halle Neun, Ingolstadt
[http://halle9-ingolstadt.de]
Script: `halle92ics.pl`
Demo: [http://www.geierb.de/~geierb/kalender/halle9.ics]
### Intro.de
[http://www.intro.de]
Script: `intro2ics.pl`
Demo: [http://www.geierb.de/~geierb/kalender/intro.ics]
### Zündfunk Veranstaltungstipps
[http://www.br.de/radio/bayern2/sendungen/zuendfunk/veranstaltungen-praesentationen/index.html]
Script: `zuendfunk2ics.pl`
Demo: [http://geierb.spdns.de/~geierb/kalender/zuendfunk-tipps.ics]
Usage
-----
Each of the scripts fetches events from a different web site and writes the ics data to STDOUT, so you will usually want to redirect the output to a file.
Example (replace `website2ics.pl` with the name of one of the scripts listed above):
```bash
$ perl website2ics.pl > calendarfile.ics
```
The only script that supports **and requires** command line arguments is `intro2ics.pl`. You have to give at least one city name for which events should be fetched.
Example:
```bash
$ perl intro2ics.pl hamburg berlin muenchen > calendarfile.ics
```
#!/usr/bin/perl
# 2013 geierb@geierb.de
# GPLv3
use strict;
use WWW::Mechanize;
use HTML::Entities;
use HTML::TreeBuilder;
use DateTime::Format::Strptime;
use Data::ICal;
use Data::ICal::Entry::Event;
use Date::ICal;
use Time::HiRes;
use Try::Tiny;
use utf8;
use Data::Dumper;
use warnings;
# Programme overview page; the detail page of every event is linked from here.
my $url = "http://www.eventhalle-westpark.de/das-programm";

# Assumed duration of an event in minutes (the programme does not state one,
# but the calendar entry needs an end time).
my $defaultDauer = 119;

# Parser for "<day>.<month>.<year> <hour>.<minute>" strings, read as
# Europe/Berlin local time.
my $datumFormat = DateTime::Format::Strptime->new(
    pattern   => '%d.%m.%Y %H.%M',
    time_zone => 'Europe/Berlin',
);

# Avoid "wide character" warnings on output.
binmode(STDOUT, ":utf8");

my $mech = WWW::Mechanize->new();
$mech->get($url) or die($!);

# Event records collected while walking the detail pages.
my @eventList;

# Collect the detail-page URL of every clickable event thumbnail...
my @eventLinks = ($mech->content() =~ /class=\"thumbnail event event-clickable\" onclick=\"showModal\(\d+,\s*\'(http.+?)\'\);\"/g);
# Return the trimmed text of the cell immediately to the right of the first
# <td> inside $table whose text matches $labelRe.
# Returns nothing (undef in scalar context) when the table, the label cell or
# its right-hand neighbour is missing, so the caller decides whether the field
# is mandatory.
sub _detailValue {
    my ($table, $labelRe) = @_;
    return unless $table;

    my $labelCell = $table->look_down('_tag' => 'td', sub { $_[0]->as_text =~ $labelRe });
    return unless $labelCell;

    # The right-hand sibling may be a plain text node, which has no methods.
    my $valueCell = $labelCell->right;
    return unless ref $valueCell;

    return $valueCell->as_trimmed_text;
}

# ...and visit each of them. Every successfully parsed page adds one hash to
# @eventList with the keys name, einlass, beginn, ende (DateTime objects),
# vorverkauf, abendkasse, ort, description, genre, veranstalter and url.
foreach my $eventLink (@eventLinks) {
    # Sometimes links are broken; skip those instead of aborting the run.
    my $ok = eval { $mech->get($eventLink); };
    next unless ($ok);

    my $root  = HTML::TreeBuilder->new_from_content($mech->content());
    my $event = {};

    my $titleNode = $root->look_down('_tag' => 'h4', 'class' => 'modal-title');
    my $tree      = $root->look_down('_tag' => 'table', 'class' => 'table table-condensed detail-table');

    # Date, plus doors/start time which share a single table row.
    my $datum         = _detailValue($tree, qr/Datum:/);
    my $einlassBeginn = _detailValue($tree, qr/Einlass:/);

    if ($titleNode && defined $datum && defined $einlassBeginn) {
        my ($einlassZeit) = $einlassBeginn =~ /^(\d+\.\d+) Uhr/;
        my ($beginnZeit)  = $einlassBeginn =~ /(\d+\.\d+) Uhr$/;
        $event->{'einlass'} = $datumFormat->parse_datetime($datum." ".$einlassZeit) if defined $einlassZeit;
        $event->{'beginn'}  = $datumFormat->parse_datetime($datum." ".$beginnZeit) if defined $beginnZeit;
    }

    # Title and both times are needed to build a calendar entry. A page that
    # lacks them is skipped with a notice rather than killing the whole run.
    unless ($event->{'einlass'} && $event->{'beginn'}) {
        warn "Skipping ".$eventLink.": title, date or times could not be read\n";
        $root->delete;
        next;
    }

    # Name
    $event->{'name'} = $titleNode->as_trimmed_text;

    # End = start + $defaultDauer
    $event->{'ende'} = $event->{'beginn'}->clone();
    $event->{'ende'}->add(minutes => $defaultDauer);

    # Advance-sale and box-office prices share a single table row (optional).
    my $ticketPreis = _detailValue($tree, qr/Ticket Preis:/);
    if (defined $ticketPreis) {
        ($event->{'vorverkauf'}) = $ticketPreis =~ /^([\d\.\,]+ €)/;
        ($event->{'abendkasse'}) = $ticketPreis =~ /Abendkasse: ([\d\.,]+ €)/;
    }

    # The location row is missing now and then; fall back to the alert cell.
    my $ort = _detailValue($tree, qr/Location:/);
    unless (defined $ort) {
        my $locationAlert = $tree->look_down('_tag' => 'td', 'class' => 'location-alert');
        if ($locationAlert) {
            ($ort) = $locationAlert->as_trimmed_text =~ /Die Veranstaltung findet in folgender Location statt: (.+$)/;
        }
    }
    $event->{'ort'} = $ort if defined $ort;

    # Description (optional; empty string when the page has none)
    my $beschreibung = $root->look_down('_tag' => 'div', 'id' => 'beschreibung');
    $event->{'description'} = $beschreibung ? $beschreibung->as_text : '';

    # Genre and promoter (empty string when the row is missing)
    my $genre = _detailValue($tree, qr/Stil:/);
    $event->{'genre'} = defined $genre ? $genre : '';
    my $veranstalter = _detailValue($tree, qr/Veranstalter:/);
    $event->{'veranstalter'} = defined $veranstalter ? $veranstalter : '';

    # URL
    $event->{'url'} = ($mech->uri())->as_string;

    push(@eventList, $event);

    # Everything needed has been copied out as strings/DateTime objects, so
    # the parse tree can be released.
    $root->delete;
}
# Create the timestamp used for DTSTAMP.
# The trailing "Z" declares the value to be UTC, so the fields must come from
# gmtime; with localtime the local wall-clock time would be labelled as UTC.
my @stamp = gmtime;
my $dstamp = sprintf("%d%02d%02dT%02d%02d%02dZ",
    $stamp[5] + 1900,   # year
    $stamp[4] + 1,      # month (gmtime counts from 0)
    $stamp[3],          # day of month
    $stamp[2],          # hour
    $stamp[1],          # minute
    $stamp[0]);         # second
# Build the calendar from @eventList and print it to STDOUT.
my $calendar=Data::ICal->new();
$calendar->add_properties(method=>"PUBLISH",
    "X-PUBLISHED-TTL"=>"P1D",
    "X-WR-CALNAME"=>"Eventhalle Westpark",
    "X-WR-CALDESC"=>"Veranstaltungen Eventhalle Westpark");

my $count=0;
foreach my $event (@eventList) {
    # Create uid
    # NOTE(review): the uid is derived from the current time, so it differs on
    # every run; calendar clients may not recognise a re-published event as
    # the same entry. Confirm whether a stable uid is wanted.
    my @tm=localtime();
    my $uid=sprintf("%d%02d%02d%02d%02d%02d%s%02d\@geierb.de",
        $tm[5] + 1900, $tm[4] + 1, $tm[3], $tm[2],
        $tm[1], $tm[0], scalar(Time::HiRes::gettimeofday()), $count);

    # Human-readable description: genre, prices (if known), doors time and
    # the long text. The doors time is shown as local (venue) time.
    my $description;
    $description.="Genre: ".$event->{'genre'}." \n";
    if ($event->{'vorverkauf'}) { $description.="Vorverkauf: ".$event->{'vorverkauf'}." \n"; }
    if ($event->{'abendkasse'}) { $description.="Abendkasse: ".$event->{'abendkasse'}." \n"; }
    $description.="Einlass: ".sprintf("%02d:%02d Uhr",$event->{'einlass'}->hour,$event->{'einlass'}->minute);
    $description.=" \n\n".(defined $event->{'description'} ? $event->{'description'} : '');

    # The scraped times are Europe/Berlin wall-clock times, while the emitted
    # values end in "Z" (UTC). Convert a copy to UTC first so that the entry
    # denotes the correct instant; seconds are forced to zero as before.
    my $dtstart=$event->{'beginn'}->clone->set_time_zone('UTC')->strftime('%Y%m%dT%H%M00Z');
    my $dtend=$event->{'ende'}->clone->set_time_zone('UTC')->strftime('%Y%m%dT%H%M00Z');

    my $eventEntry=Data::ICal::Entry::Event->new();
    $eventEntry->add_properties(
        uid=>$uid,
        categories=>$event->{'genre'},
        summary => $event->{'name'},
        description => $description,
        dtstart=>$dtstart,
        #duration=>"PT3H",
        dtend=>$dtend,
        dtstamp=>$dstamp,
        class=>"PUBLIC",
        organizer=>$event->{'veranstalter'},
        location=>$event->{'ort'},
        url=>$event->{'url'},
    );
    $calendar->add_entry($eventEntry);
    $count++;
}
print $calendar->as_string;
#!/usr/bin/perl
# 2015 geierb@geierb.de
# GPLv3
use strict;
use WWW::Mechanize;
use HTML::Entities;
use HTML::TreeBuilder;
use DateTime::Format::Strptime;
use Data::ICal;
use Data::ICal::Entry::Event;
use Date::ICal;
use Time::HiRes;
use Try::Tiny;
use utf8;
use Data::Dumper;
use warnings;
# Programme page that links to every event's detail page.
my $url = "http://www.neun-ingolstadt.de/programm/";

# Assumed duration of an event in minutes (the programme does not state one,
# but the calendar entry needs an end time).
my $defaultDauer = 119;

# Parser for "<day>-<month>-<year> <hour>:<minute>" strings, read as
# Europe/Berlin local time.
my $datumFormat = DateTime::Format::Strptime->new(
    pattern   => '%d-%m-%Y %H:%M',
    time_zone => 'Europe/Berlin',
);

# Avoid "wide character" warnings on output.
binmode(STDOUT, ":utf8");

# Event records collected while walking the detail pages.
my @events;

my $mech = WWW::Mechanize->new();
$mech->get($url) or die($!);

# Collect all detail links from the calendar page.
my @eventLinks = $mech->find_all_links(text => 'MEHR INFORMATIONEN');
# Combine a date ("dd-mm-yyyy") and a clock time ("hh:mm") into a DateTime
# using the given Strptime parser.
# "24:00" is mapped to 00:00 of the following day (presumably because the
# parser's %H does not accept hour 24 -- the original code special-cased it).
# Returns undef when the string cannot be parsed.
sub _zeitpunkt {
    my ($format, $datum, $uhrzeit) = @_;

    if ($uhrzeit =~ /24:00/) {
        # The parsed string must match the pattern '%d-%m-%Y %H:%M' exactly,
        # so no trailing " Uhr" here.
        my $mitternacht = $format->parse_datetime($datum." 00:00");
        $mitternacht->add(days => 1) if $mitternacht;
        return $mitternacht;
    }
    return $format->parse_datetime($datum." ".$uhrzeit);
}

# Visit every event page. Each usable page adds one hash to @events with the
# keys datum (string), einlass/beginn (DateTime, only when given on the page),
# name, description, url and ort.
foreach my $eventLink (@eventLinks) {
    $mech->get($eventLink);
    my $root  = HTML::TreeBuilder->new_from_content($mech->content());
    my $event = {};

    #### left column
    my $tree = $root->look_down('_tag' => 'div', 'id' => 'content');
    my $h    = $tree ? $tree->look_down('_tag' => 'div', class => 'programmMeta') : undef;
    $h or die("No 'programmMeta' block found on ".$mech->uri()."\n");
    my $metaText = $h->as_trimmed_text();

    # Date (mandatory: without it no calendar entry can be built)
    ($event->{'datum'}) = $metaText =~ /^(\d{2}-\d{2}-\d{4})/;
    unless (defined $event->{'datum'}) {
        warn "Skipping ".$mech->uri().": no date found\n";
        $root->delete;
        next;
    }

    # Doors
    if (my ($einlass) = $metaText =~ /Einlass (\d{2}:\d{2}) Uhr/) {
        $event->{'einlass'} = _zeitpunkt($datumFormat, $event->{'datum'}, $einlass);
    }
    # Start
    if (my ($beginn) = $metaText =~ /Beginn (\d{2}:\d{2}) Uhr/) {
        $event->{'beginn'} = _zeitpunkt($datumFormat, $event->{'datum'}, $beginn);
    }

    #### right column
    $h = $root->look_down('class' => 'articleContent')
        or die("No 'articleContent' block found on ".$mech->uri()."\n");

    # Name
    my $titel = $h->look_down('_tag' => 'h2')
        or die("No event title found on ".$mech->uri()."\n");
    my $name = $titel->as_trimmed_text();
    $event->{'name'} = $name;

    # Description: the article text with the leading title removed. The name
    # is quoted so that regex metacharacters in it are taken literally.
    $event->{'description'} = $h->as_trimmed_text();
    $event->{'description'} =~ s/^\s*\Q$name\E\s*//;

    # URL
    $event->{'url'} = ($mech->uri())->as_string;

    # Location
    $event->{'ort'} = "Kulturzentrum neun, Elisabethstr. 9a, 85051 Ingolstadt";

    push(@events, $event);

    # Only strings and DateTime objects were kept; release the parse tree.
    $root->delete;
}
# Create the timestamp used for DTSTAMP.
# The trailing "Z" declares the value to be UTC, so the fields must come from
# gmtime; with localtime the local wall-clock time would be labelled as UTC.
my @stamp = gmtime;
my $dstamp = sprintf("%d%02d%02dT%02d%02d%02dZ",
    $stamp[5] + 1900,   # year
    $stamp[4] + 1,      # month (gmtime counts from 0)
    $stamp[3],          # day of month
    $stamp[2],          # hour
    $stamp[1],          # minute
    $stamp[0]);         # second
# Format a DateTime as an iCalendar UTC timestamp ("YYYYMMDDTHHMM00Z").
# The object is cloned before its time zone is switched, so the caller's
# DateTime is left untouched. Seconds are forced to zero.
sub _utcStamp {
    my ($zeit) = @_;
    return $zeit->clone->set_time_zone('UTC')->strftime('%Y%m%dT%H%M00Z');
}

# Build the calendar from @events and print it to STDOUT.
my $calendar=Data::ICal->new();
$calendar->add_properties(method=>"PUBLISH",
    "X-PUBLISHED-TTL"=>"P1D",
    "X-WR-CALNAME"=>"Kulturzentrum neun",
    "X-WR-CALDESC"=>"Veranstaltungen Kulturzentrum neun");

my $count=0;
foreach my $event (@events) {
    # Create uid
    # NOTE(review): the uid is derived from the current time, so it differs on
    # every run; confirm whether a stable uid is wanted.
    my @tm=localtime();
    my $uid=sprintf("%d%02d%02d%02d%02d%02d%s%02d\@geierb.de",
        $tm[5] + 1900, $tm[4] + 1, $tm[3], $tm[2],
        $tm[1], $tm[0], scalar(Time::HiRes::gettimeofday()), $count);

    # Split the date; capture into named variables instead of relying on
    # $1..$3, which would be stale if the match failed.
    my ($tag,$monat,$jahr);
    if (defined $event->{'datum'}) {
        ($tag,$monat,$jahr)=$event->{'datum'}=~/(\d\d)\-(\d\d)\-(\d\d\d\d)/;
    }
    unless (defined $jahr) {
        warn "Skipping event without usable date: ".(defined $event->{'name'} ? $event->{'name'} : '?')."\n";
        next;
    }

    my $beschreibung=defined $event->{'description'} ? $event->{'description'} : '';

    # Prefer the start time; fall back to the doors time.
    my ($start,$label);
    if ($event->{'beginn'}) {
        ($start,$label)=($event->{'beginn'},"Beginn");
    }
    elsif ($event->{'einlass'}) {
        ($start,$label)=($event->{'einlass'},"Einlass");
    }

    my ($startTime,$endTime);
    if ($start) {
        # The scraped times are Europe/Berlin wall-clock times; the emitted
        # value ends in "Z", so convert to UTC first.
        $startTime=_utcStamp($start);
        # Show the local time zero-padded ("20:00", not "20:0").
        $event->{'description'}=sprintf("%s: %02d:%02d Uhr %s",$label,$start->hour,$start->min,$beschreibung);
        # End = start + $defaultDauer
        $event->{'ende'}=$start->clone();
        $event->{'ende'}->add(minutes=>$defaultDauer);
        $endTime=_utcStamp($event->{'ende'});
    }
    else {
        # Neither start nor doors time given: build an all-day event. The end
        # is the following day; if that cannot be computed, keep the start
        # date as end (previous behaviour).
        $startTime="$jahr$monat$tag";
        $endTime=$startTime;
        my $folgetag=$datumFormat->parse_datetime($event->{'datum'}." 00:00");
        $endTime=$folgetag->add(days=>1)->strftime('%Y%m%d') if $folgetag;
    }

    my $eventEntry=Data::ICal::Entry::Event->new();
    $eventEntry->add_properties(
        uid=>$uid,
        summary => $event->{'name'},
        description => $event->{'description'},
        dtstart=>$startTime,
        dtend=>$endTime,
        # duration=>"PT3H",
        dtstamp=>$dstamp,
        class=>"PUBLIC",
        location=>$event->{'ort'},
        url=>$event->{'url'},
    );
    $calendar->add_entry($eventEntry);
    $count++;
}
print $calendar->as_string;
#!/usr/bin/perl
# 2014 geierb@geierb.de
# GPLv3
use strict;
use WWW::Mechanize;
use HTML::Entities;
use HTML::Strip;
use HTML::TreeBuilder;
use DateTime::Format::Strptime;
use Data::ICal;
use Data::ICal::Entry::Event;
use Date::ICal;
use Time::HiRes;
use Try::Tiny;
use utf8;
use open qw(:std :utf8);
use Data::Dumper;
use warnings;
#my @cities=("ingolstadt","muenchen","nuernberg");
# City names to look up are taken from the command line.
my @cities=@ARGV;
if (@cities==0) {
    # STDOUT carries the ics data and is normally redirected into a file, so
    # the usage text goes to STDERR where the user can actually see it.
    print STDERR "Give me city names for which to look for upcoming events.\n";
    print STDERR "Example: ".$0." ingolstadt muenchen nuernberg\n\n";
    exit 1;
}
binmode STDOUT, ":utf8"; # avoid "wide character" warnings
my $mech=WWW::Mechanize->new();
my $hs=HTML::Strip->new();
# Event records collected by the scraping loop.
my @events;
# Walk the paginated event listing of every requested city. Each usable list
# entry adds one hash to @events with the keys beginn/ende (DateTime, all-day:
# ende is the following day), name, location and (if found) description.
foreach my $city (@cities) {
    my $url="http://www.intro.de/termine/".lc($city)."/seite/";
    my $seite=1;
    do {
        $mech->get($url.$seite++) or die($!);
        my $root=HTML::TreeBuilder->new_from_content($mech->content());
        foreach my $day ($root->look_down('_tag'=>'div','class'=>'date item')) {
            foreach my $event ($day->look_down('_tag'=>'li')) {
                my $eventData={};

                # The date is taken from the detail link. Capture into named
                # variables so a failed match cannot leave stale $1..$3.
                my $linkNode=$event->look_down('_tag'=>'a');
                my $moreLink=$linkNode ? $linkNode->attr('href') : undef;
                my ($jahr,$monat,$tag)=defined $moreLink
                    ? $moreLink=~/.+?\/.+?\/(\d{4})-(\d{2})-(\d{2})\//
                    : ();

                my $artistNode=$event->look_down('_tag'=>'p','class'=>'artist');
                my $venueNode=$event->look_down('_tag'=>'p','class'=>'name');

                unless (defined $tag && $artistNode && $venueNode) {
                    warn "Skipping list entry without date, artist or venue on ".$mech->uri()."\n";
                    next;
                }

                $eventData->{'beginn'}=DateTime->new(year=>$jahr,month=>$monat,day=>$tag);
                $eventData->{'ende'}=$eventData->{'beginn'}->clone();
                $eventData->{'ende'}->add(days=>1);
                $eventData->{'name'}=$artistNode->as_text;

                my ($venue)=$venueNode->as_text=~/^\s*(.*?)\s*$/;
                $venue='' unless defined $venue;
                $eventData->{'location'}=$venue;

                # Fetch the detail page for address and description.
                $mech->follow_link('url'=>$moreLink);
                my $root2=HTML::TreeBuilder->new_from_content($mech->content());
                my $paragraph=$root2->look_down('_tag'=>'p','id'=>'first-paragraph');
                my @moreInfo=$paragraph
                    ? split(/<br\s*\/><br\s*\/>/,$paragraph->as_HTML)
                    : ();

                foreach my $currentMoreInfo (@moreInfo) {
                    my $currentMoreInfoText=$hs->parse($currentMoreInfo);
                    # Reset the stripper so state from this fragment (e.g. an
                    # unclosed tag) does not leak into the next one.
                    $hs->eof;
                    $currentMoreInfoText='' unless defined $currentMoreInfoText;
                    $currentMoreInfoText=~s/^\s*//;
                    $currentMoreInfoText=~s/\s*$//;

                    # The venue name is quoted (\Q..\E) so that characters
                    # such as "(" or "+" in it are matched literally.
                    if ($currentMoreInfoText=~/^Location/) {
                        my @loc=grep { defined($_) && length($_) }
                            $currentMoreInfoText=~/^Location:\s+(\Q$venue\E)\s*(.*?)$/;
                        # Keep the venue from the listing if the line does not
                        # have the expected shape.
                        $eventData->{'location'}=join(", ",@loc) if @loc;
                    }
                    elsif ($currentMoreInfoText=~/^\Q$venue\E$/) {
                        next;
                    }
                    else {
                        $eventData->{'description'}= $currentMoreInfoText;
                    }
                }
                $root2->delete;

                # Return to the listing page so the pagination check below
                # sees the right document.
                $mech->back();
                push(@events,$eventData);
            }
        }
        $root->delete;
    } while($mech->find_link('class'=>'arrow next'));
}
# Create the timestamp used for DTSTAMP.
# The trailing "Z" declares the value to be UTC, so the fields must come from
# gmtime; with localtime the local wall-clock time would be labelled as UTC.
my @stamp = gmtime;
my $dstamp = sprintf("%d%02d%02dT%02d%02d%02dZ",
    $stamp[5] + 1900,   # year
    $stamp[4] + 1,      # month (gmtime counts from 0)
    $stamp[3],          # day of month
    $stamp[2],          # hour
    $stamp[1],          # minute
    $stamp[0]);         # second
# Build the calendar from @events and print it to STDOUT.
my $calendar = Data::ICal->new();
$calendar->add_properties(
    method            => "PUBLISH",
    "X-PUBLISHED-TTL" => "P1D",
    "X-WR-CALNAME"    => "Intro.de",
    "X-WR-CALDESC"    => "Veranstaltungen in ".join(", ", map { ucfirst lc } @cities),
);

my $count = 0;
foreach my $event (@events) {
    # uid: current local time, high-resolution clock value and running number
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime();
    my $uid = sprintf("%d%02d%02d%02d%02d%02d%s%02d\@geierb.de",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec,
        scalar(Time::HiRes::gettimeofday()), $count);

    # Date-only values for first day and the day after (all-day event).
    my ($firstDay, $dayAfter) = map {
        my $date = $event->{$_};
        sprintf("%04d%02d%02d", $date->year, $date->month, $date->day);
    } qw(beginn ende);

    my %properties = (
        uid         => $uid,
        summary     => $event->{'name'},
        description => $event->{'description'},
        dtstart     => $firstDay,
        dtend       => $dayAfter,
        dtstamp     => $dstamp,
        class       => "PUBLIC",
        location    => $event->{'location'},
    );

    my $eventEntry = Data::ICal::Entry::Event->new();
    $eventEntry->add_properties(%properties);
    $calendar->add_entry($eventEntry);
    $count++;
}
print $calendar->as_string;
#!/usr/bin/perl
# 2013 geierb@geierb.de
# GPLv3
use strict;
use WWW::Mechanize;
use HTML::Entities;
use HTML::TreeBuilder;
use DateTime::Format::Strptime;
use Data::ICal;
use Data::ICal::Entry::Event;
use Date::ICal;
use Time::HiRes;
use Try::Tiny;
use Data::Dumper;
use utf8;
use warnings;
# Configuration and initial page fetch for the Zuendfunk event scraper.
my $defaultDauer=119; # assumed duration of an event in minutes (not given in the programme, but needed for the calendar entry)
# Entry page of the event listing.
my $url="http://www.br.de/radio/bayern2/sendungen/zuendfunk/veranstaltungen-praesentationen/index.html";
# Avoid "wide character" warnings
binmode STDOUT, ":utf8";
# Parser for date/time strings matching '%A, %d. %B %Y, %H:%M Uhr' with German
# day and month names (locale de_DE), read as Europe/Berlin local time.
my $datumZeitFormat=DateTime::Format::Strptime->new('pattern'=>'%A, %d. %B %Y, %H:%M Uhr','time_zone'=>'Europe/Berlin','locale'=>'de_DE');
my $mech=WWW::Mechanize->new();
$mech->get($url) or die($!);
# Follow the link labelled "Alle Termine des Monats" from the entry page.
$mech->follow_link(text=>'Alle Termine des Monats');
my $tree=HTML::TreeBuilder->new_from_content($mech->content());
# One <div class="calendar_content"> per entry -- presumably one per event; the
# code consuming @eventTrees is outside this view.
my @eventTrees=$tree->look_down('_tag'=>'div','class'=>'calendar_content');