Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Citronalco
solr-calibre-websearch
Commits
a231970a
Commit
a231970a
authored
Aug 20, 2020
by
Bernhard Geier
Browse files
use python3 for indextool
parent
cff9d145
Changes
1
Hide whitespace changes
Inline
Side-by-side
indextool/indexer
View file @
a231970a
#!/usr/bin/env python
#!/usr/bin/env python
3
# -*- coding: utf-8 -*-
import
xml.etree.ElementTree
as
etree
...
...
@@ -31,7 +31,7 @@ def get_metadata(basedir):
if
'metadata.opf'
in
files
:
path
=
'/'
.
join
(
root
.
split
(
'/'
)[
-
2
:])
filename
=
get_ebook_file
(
files
)
extension
=
os
.
path
.
splitext
(
filename
)[
1
].
lower
()[
1
:]
extension
=
os
.
path
.
splitext
(
filename
)[
1
].
lower
()[
1
:]
cover
=
''
if
'cover.jpg'
in
files
:
cover
=
'cover.jpg'
...
...
@@ -45,24 +45,24 @@ def parse_metadata(metadata):
root
=
x
.
getroot
()
def
get_field
(
matcher
):
matches
=
[]
matches
=
[]
for
match
in
root
.
findall
(
'./opf:metadata/dc:%s'
%
matcher
,
namespaces
=
namespaces
):
matches
.
append
(
match
.
text
)
matches
.
append
(
match
.
text
)
return
matches
def
get_meta_field
(
matcher
):
matches
=
[]
matches
=
[]
for
match
in
root
.
findall
(
"./opf:metadata/opf:meta[@name='%s']"
%
matcher
,
namespaces
=
namespaces
):
matches
.
append
(
match
.
get
(
"content"
))
return
matches
matches
.
append
(
match
.
get
(
"content"
))
return
matches
def
get_identifiers
():
matches
=
[]
matches
=
[]
for
match
in
root
.
findall
(
"./opf:metadata/dc:identifier[@opf:scheme]"
,
namespaces
=
namespaces
):
identifier_type
=
match
.
get
(
'{http://www.idpf.org/2007/opf}scheme'
)
if
identifier_type
==
'calibre'
:
continue
matches
.
append
(
identifier_type
+
':'
+
match
.
text
)
identifier_type
=
match
.
get
(
'{http://www.idpf.org/2007/opf}scheme'
)
if
identifier_type
==
'calibre'
:
continue
matches
.
append
(
identifier_type
+
':'
+
match
.
text
)
return
matches
...
...
@@ -79,19 +79,19 @@ def parse_metadata(metadata):
# description may contain html, we remove that
description
=
get_field
(
'description'
)
if
description
:
soup
=
BeautifulSoup
(
description
[
0
],
"lxml"
)
for
i
in
soup
([
'script'
,
'style'
]):
# remove script and style
i
.
extract
()
description
=
soup
.
get_text
()
# get text
lines
=
(
line
.
strip
()
for
line
in
description
.
splitlines
())
# break into lines and remove leading and trailing space on each
chunks
=
(
phrase
.
strip
()
for
line
in
lines
for
phrase
in
line
.
split
(
" "
))
# break multi-headlines into a line each
description
=
'
\n
'
.
join
(
chunk
for
chunk
in
chunks
if
chunk
)
# drop blank lines
soup
=
BeautifulSoup
(
description
[
0
],
"lxml"
)
for
i
in
soup
([
'script'
,
'style'
]):
# remove script and style
i
.
extract
()
description
=
soup
.
get_text
()
# get text
lines
=
(
line
.
strip
()
for
line
in
description
.
splitlines
())
# break into lines and remove leading and trailing space on each
chunks
=
(
phrase
.
strip
()
for
line
in
lines
for
phrase
in
line
.
split
(
" "
))
# break multi-headlines into a line each
description
=
'
\n
'
.
join
(
chunk
for
chunk
in
chunks
if
chunk
)
# drop blank lines
identifier
=
get_identifiers
()
language
=
get_field
(
'language'
)
date
=
get_field
(
'date'
)
if
(
date
[
0
]):
date
=
"%sZ"
%
parse
(
date
[
0
]).
astimezone
(
tzutc
()).
isoformat
()
date
=
"%sZ"
%
parse
(
date
[
0
]).
astimezone
(
tzutc
()).
isoformat
()
publisher
=
get_field
(
'publisher'
)
author_sort
=
get_meta_field
(
'calibre:author_sort'
)
...
...
@@ -121,10 +121,10 @@ def parse_metadata(metadata):
'date'
:
date
,
'year'
:
date
[:
4
],
'publisher'
:
publisher
,
'publisher'
:
publisher
,
'author_sort'
:
author_sort
,
'title_sort'
:
title_sort
,
'author_sort'
:
author_sort
,
'title_sort'
:
title_sort
,
}
...
...
@@ -154,6 +154,7 @@ if __name__ == '__main__':
if
not
ebook_data
:
print
(
"Unable to find metadata in %s."
%
metadata_file
)
continue
ebook_data
.
update
({
'path'
:
path
,
'coverfile'
:
cover
,
'filename'
:
filename
,
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment