import html
import re

import ebooklib
from ebooklib import epub
4
# Base names (without the ".epub" extension) of the books to convert.
filenames = [
    "Convenience Store Woman (Sayaka Murata) (z-lib.org)",
    "The Martian_ A Novel - Weir, Andy",
]

# Folder prefixes. They are joined to a file name by plain string
# concatenation, so the trailing slash is required.
input_folder = "./input/"
output_folder = "./output/"
8
# Code

# Literal tag substitutions applied, in this order, before any pattern-based
# clean-up. "<br/>" has to become "\n" at this stage so that mark_headers()
# can repeat the heading prefix on every line of a multi-line heading.
_LITERAL_REPLACEMENTS = (
    ('<p class="crt1">', ""),
    ('<p class="crt">', ""),
    ('<p class="toc">', ""),
    ('<p class="indent">', " "),
    ('<p class="nonindent">', ""),
    ('<p class="center">', " "),
    ('<p class="extract">', ""),
    ("</p>", ""),
    ("<div>", ""),
    ("</div>", ""),
    ("<br/>", "\n"),
    ("<em>", ""),  # Ignore italics
    ("</em>", ""),
    ("<strong>", ""),  # Ignore bold text
    ("</strong>", ""),
    ('<span class="small">', ""),  # Ignore small text
    ('<span class="dropcaps">', ""),
    ("</span>", ""),
)

# Marker emitted for each supported heading level.
_HEADER_PREFIXES = {"1": "# ", "2": "## "}

# An opening <h1>/<h2> tag (group 1), a closing one (group 2), or a newline.
_HEADER_TOKEN_RE = re.compile(r"<h([12])[^>]*>|</h([12])\s*>|\n")

# Any remaining paragraph opening tag; it is turned into a line break.
_PARAGRAPH_TAG_RE = re.compile(r"<p[^>]*>")

# Remaining structural tags that are dropped without a replacement. Each
# match stops at the first ">", so it can never swallow text between tags.
_LEFTOVER_TAG_RE = re.compile(
    r"</(?:body|a|div)>|<(?:body|span|img|a|div)[^>]*>"
)


def mark_headers(markup):
    """Replace <h1>/<h2> tags in *markup* with "# " / "## " line prefixes.

    The opening tag (with any attributes) becomes the prefix, the closing tag
    is removed, and every newline that occurs while a heading is open is
    followed by that heading's prefix again.
    """
    open_levels = set()

    def replace_token(match):
        opened, closed = match.group(1), match.group(2)
        if opened:
            open_levels.add(opened)
            return _HEADER_PREFIXES[opened]
        if closed:
            open_levels.discard(closed)
            return ""
        # A newline: continue every heading that is still open.
        continuation = "".join(
            _HEADER_PREFIXES[level] for level in sorted(open_levels)
        )
        return "\n" + continuation

    return _HEADER_TOKEN_RE.sub(replace_token, markup)


def html_to_text(markup):
    """Convert the concatenated XHTML body markup of a book to plain text.

    Known tags are removed or replaced, headings are marked with "#"
    prefixes, character references are decoded and carriage returns are
    stripped.
    """
    for tag, replacement in _LITERAL_REPLACEMENTS:
        markup = markup.replace(tag, replacement)

    text = mark_headers(markup)
    text = _PARAGRAPH_TAG_RE.sub("\n", text)
    text = _LEFTOVER_TAG_RE.sub("", text)

    # Decode entities only after the tags are gone, so escaped markup such as
    # "&lt;p&gt;" is never mistaken for a real tag. "&#13;" decodes to "\r",
    # which is removed on the next line.
    text = html.unescape(text)

    # Strip out return carriage line breaks
    return text.replace("\r", "")


def build_save_data(book_title, book_creator, content):
    """Return the text of a .phn file: title, author and content sections."""
    return "".join(
        (
            "-+---+-BOOK TITLE-+---+-\n",
            book_title + "\n",
            "-+---+-BOOK AUTHOR-+---+-\n",
            book_creator + "\n",
            "-+---+-BOOK CONTENT-+---+-\n",
            content,
        )
    )


def _first_dc_value(book, name):
    """Return the first Dublin Core value for *name*, or "" if there is none."""
    entries = book.get_metadata("DC", name)
    if not entries:
        return ""
    return entries[0][0] or ""


def convert_book(filename):
    """Convert input_folder/<filename>.epub into output_folder/<filename>.phn."""
    book = epub.read_epub(input_folder + filename + ".epub")

    book_title = _first_dc_value(book, "title")
    book_creator = _first_dc_value(book, "creator")
    print(f"{book_title}, {book_creator}")

    markup = "".join(
        item.get_body_content().decode("UTF-8")
        for item in book.get_items()
        if item.get_type() == ebooklib.ITEM_DOCUMENT
    )
    save_data = build_save_data(book_title, book_creator, html_to_text(markup))

    # Explicit UTF-8 so non-ASCII text does not depend on the locale default.
    output_path = output_folder + filename + ".phn"
    with open(output_path, "w", encoding="utf-8") as output_file:
        output_file.write(save_data)


def main():
    """Convert every book listed in ``filenames``."""
    for filename in filenames:
        convert_book(filename)


if __name__ == "__main__":
    main()