1import ebooklib
  2import re
  3from ebooklib import epub
  4
  5filenames = ["Convenience Store Woman (Sayaka Murata) (z-lib.org)","The Martian_ A Novel - Weir, Andy"]
  6input_folder="./input/"
  7output_folder="./output/"
  8
  9# Code
 10for filename in filenames:
 11    book = epub.read_epub(input_folder+filename+".epub")
 12
 13    book_title = book.get_metadata('DC', 'title')[0][0]
 14    book_creator = book.get_metadata('DC', 'creator')[0][0]
 15
 16    print(book_title+", "+book_creator)
 17
 18    converted_book_content = ""
 19
 20    for item in book.get_items():
 21        if item.get_type() == ebooklib.ITEM_DOCUMENT:
 22            converted_book_content+=item.get_body_content().decode("UTF-8")
 23
 24    converted_book_content = converted_book_content.replace("<p class=\"crt1\">","")
 25    converted_book_content = converted_book_content.replace("<p class=\"crt\">","")
 26    converted_book_content = converted_book_content.replace("<p class=\"toc\">","")
 27    converted_book_content = converted_book_content.replace("<p class=\"indent\">", "  ")
 28    converted_book_content = converted_book_content.replace("<p class=\"nonindent\">","")
 29    converted_book_content = converted_book_content.replace("<p class=\"center\">", "    ")
 30    converted_book_content = converted_book_content.replace("<p class=\"extract\">", "")
 31    converted_book_content = converted_book_content.replace("</p>", "")
 32
 33    converted_book_content = converted_book_content.replace("<div>", "")
 34    converted_book_content = converted_book_content.replace("</div>", "")
 35
 36    converted_book_content = converted_book_content.replace("&amp;","&")
 37    converted_book_content = converted_book_content.replace("&#13;", "\r")
 38    converted_book_content = converted_book_content.replace("<br/>", "\n")
 39
 40    converted_book_content = converted_book_content.replace("<em>","") # Ignore italics
 41    converted_book_content = converted_book_content.replace("</em>","") # Ignore italics
 42    converted_book_content = converted_book_content.replace("<strong>","") # Ignore bold text
 43    converted_book_content = converted_book_content.replace("</strong>","") # Ignore bold text
 44
 45    converted_book_content = converted_book_content.replace("<span class=\"small\">","") # Ignore small text
 46    converted_book_content = converted_book_content.replace("<span class=\"dropcaps\">","")
 47    converted_book_content = converted_book_content.replace("</span>","")
 48
 49    div = False
 50    header_a = False
 51    header_b = False
 52    header_a_open = False
 53    header_b_open = False
 54    final_converted_book_content = ""
 55    for character_index in range(len(converted_book_content)):
 56        # Opening tags
 57        if len(converted_book_content)-character_index > 3:
 58            if converted_book_content[character_index]+converted_book_content[character_index+1]+converted_book_content[character_index+2] == "<h1":
 59                header_a = True
 60                header_a_open = True
 61                final_converted_book_content += "# "
 62        
 63        if len(converted_book_content)-character_index > 3:
 64            if converted_book_content[character_index]+converted_book_content[character_index+1]+converted_book_content[character_index+2] == "<h2":
 65                header_b = True
 66                header_b_open = True
 67                final_converted_book_content += "## "
 68
 69        if len(converted_book_content)-character_index > 4:
 70            if converted_book_content[character_index]+converted_book_content[character_index+1]+converted_book_content[character_index+2]+converted_book_content[character_index+3] == "<div":
 71                div = True
 72        # Closing tags
 73        if len(converted_book_content)-character_index > 4:
 74            if converted_book_content[character_index]+converted_book_content[character_index+1]+converted_book_content[character_index+2]+converted_book_content[character_index+3] == "</h1":
 75                header_a_open = False
 76        if len(converted_book_content)-character_index > 4:
 77            if converted_book_content[character_index]+converted_book_content[character_index+1]+converted_book_content[character_index+2]+converted_book_content[character_index+3] == "</h2":
 78                header_b_open = False
 79        # Check for end of opening tag
 80        if converted_book_content[character_index-1] == ">":
 81            if header_a:
 82                header_a = False
 83            if header_b:
 84                header_b = False
 85            if div:
 86                div = False
 87        # Remove opening tags
 88        if header_a:
 89            pass
 90        elif header_b:
 91            pass
 92        elif div:
 93            pass
 94        else:
 95            final_converted_book_content+=converted_book_content[character_index]
 96        # Add # to headers
 97        if converted_book_content[character_index] == "\n":
 98            if header_a_open:
 99                final_converted_book_content+="# "
100            if header_b_open:
101                final_converted_book_content+="## "
102    
103    # Cleanup unused data
104    final_converted_book_content = final_converted_book_content.replace("</h1>","")
105    final_converted_book_content = final_converted_book_content.replace("</h2>","")
106    final_converted_book_content = final_converted_book_content.replace("</body>","")
107    final_converted_book_content = final_converted_book_content.replace("</a>","")
108    final_converted_book_content = final_converted_book_content.replace("</div>","")
109
110    final_converted_book_content = re.sub("(?s)<body.*?>", "", final_converted_book_content)
111    final_converted_book_content = re.sub("(?s)<p.*?>", "\n", final_converted_book_content)
112    final_converted_book_content = re.sub("(?s)<span.*?>", "", final_converted_book_content)
113    final_converted_book_content = re.sub("(?s)<img.*?/>", "", final_converted_book_content)
114    final_converted_book_content = re.sub("(?s)<a.*?>", "", final_converted_book_content)
115    final_converted_book_content = re.sub("(?s)<div.*?/>", "", final_converted_book_content)
116    final_converted_book_content = re.sub("(?s)<div.*?>", "", final_converted_book_content)
117
118    # Strip out return carriage line breaks
119    final_converted_book_content = final_converted_book_content.replace("\r","")
120    
121    # Generate final_save_data
122    final_save_data = ""
123
124    final_save_data += "-+---+-BOOK TITLE-+---+-\n"
125    final_save_data += book_title+"\n"
126    final_save_data += "-+---+-BOOK AUTHOR-+---+-\n"
127    final_save_data += book_creator+"\n"
128    final_save_data += "-+---+-BOOK CONTENT-+---+-\n"
129    final_save_data += final_converted_book_content
130
131    output_file = open(output_folder+filename+".phn","w+")
132    output_file.write(final_save_data)
133    output_file.close()