!wget https://raw.githubusercontent.com/PerseusDL/canonical-greekLit/master/data/tlg0059/tlg011/tlg0059.tlg011.perseus-eng2.xml
!wget https://raw.githubusercontent.com/PerseusDL/canonical-greekLit/master/data/tlg0059/tlg011/tlg0059.tlg011.perseus-grc2.xml
from bs4 import BeautifulSoup
with open("tlg0059.tlg011.perseus-eng2.xml") as fp:
    soup = BeautifulSoup(fp, 'lxml')
all_divs = soup.find_all('div', {"subtype": "section"})
for div in all_divs:
    print(div['n'])
# Sections that contain exactly two indirect-speech <said> elements: print both speakers
all_divs = soup.find_all('div', {"subtype": "section"})
for div in all_divs:
    n = div['n']
    said = div.find_all('said', {"direct": "false"})
    if len(said) == 2:
        who1 = said[0]['who']
        who2 = said[1]['who']
        print(n, who1, who2)
# Sections that contain exactly one indirect-speech <said> element: print its speaker
all_divs = soup.find_all('div', {"subtype": "section"})
for div in all_divs:
    n = div['n']
    said = div.find_all('said', {"direct": "false"})
    if len(said) == 1:
        who = said[0]['who']
        print(n, who)
all_divs = soup.find_all('div', {"subtype": "section"})
for div in all_divs:
    n = div['n']
    said = div.find_all('said', {"direct": "false"})
    if len(said) == 1:
        who = said[0]['who']
        text = div.find_all(text=True)
        print(n, who)
        print(text)
all_divs = soup.find_all('div', {"subtype": "section"})
for div in all_divs:
    n = div['n']
    said = div.find_all('said', {"direct": "false"})
    if len(said) == 1:
        who = said[0]['who']
        text_div = div.find_all(text=True)
        said = said[0]
        text_said = said.find_all(text=True)
        print(n, who)
        print(text_div)
        print(text_said)
        print()
# Collect the text nodes of each section and of its single indirect <said>
texts_div = []
texts_said = []
all_divs = soup.find_all('div', {"subtype": "section"})
for div in all_divs:
    n = div['n']
    said = div.find_all('said', {"direct": "false"})
    if len(said) == 1:
        who = said[0]['who']
        text_div = div.find_all(text=True)
        texts_div.append(text_div)
        said = said[0]
        text_said = said.find_all(text=True)
        texts_said.append(text_said)
print(texts_div[0])
print(texts_said[0])
import re
# str() of the text-node list shows each element's repr, so newlines appear as
# the two characters "\" and "n"; the patterns therefore use a double backslash
my_str = str(texts_div[0])
result1 = re.sub(r"'\\n', ", "", my_str)
result2 = re.sub(r"'\\n\s+", "", result1)
print(my_str)
print(type(my_str))
print(result1)
print(result2)
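# A minimal toy check (not from the original notebook) of why the patterns above
# need a double backslash: str() of a list shows each element's repr, so a real
# newline is rendered as the two characters "\" and "n".
toy = str(['\n', 'example'])
print(toy)                          # ['\n', 'example']
print(re.sub(r"'\\n', ", "", toy))  # ['example']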
# Compare the full section text with the text of its single indirect <said>:
# True means the whole section sits inside that one reported speech
all_divs = soup.find_all('div', {"subtype": "section"})
for div in all_divs:
    n = div['n']
    said = div.find_all('said', {"direct": "false"})
    if len(said) == 1:
        who = said[0]['who']
        text_div = re.sub(r"'\\n\s+|'\\n', ", "", str(div.find_all(text=True)))
        said = said[0]
        text_said = re.sub(r"'\\n\s+|'\\n', ", "", str(said.find_all(text=True)))
        print(n, text_div == text_said)
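# NOTE: the `mono` dictionary used when writing divs.xml below is never defined
# in the notebook as transcribed. Judging from how it is used (mono[n] supplies
# the speaker for the nested <said>), it presumably mapped each section number
# to its single speaker when the whole section is one indirect speech. A hedged
# reconstruction under that assumption:
mono = {}
for div in soup.find_all('div', {"subtype": "section"}):
    n = div['n']
    said = div.find_all('said', {"direct": "false"})
    if len(said) == 1:
        who = said[0]['who']
        text_div = re.sub(r"'\\n\s+|'\\n', ", "", str(div.find_all(text=True)))
        text_said = re.sub(r"'\\n\s+|'\\n', ", "", str(said[0].find_all(text=True)))
        if text_div == text_said:
            mono[n] = who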
with open("tlg0059.tlg011.perseus-grc2.xml") as fp:
greek_soup = BeautifulSoup(fp, 'lxml')
f = open('divs.xml', 'w')
all_divs = greek_soup.find_all('div', {"subtype" : "section"})
for div in all_divs:
n = div['n']
str_div = str(div)
if n in mono:
str_div = str_div.replace('</said>', '</said></said>')
str_div = str_div.replace('<said rend="merge" who="#Ἀπολλόδωρος">', '<said who="#Ἀπολλόδωρος" rend="merge"><said who="{}" direct="false">'.format(mono[n]))
f.write(str_div)
f.close()
# Inspect the sections whose single indirect <said> does not cover the whole
# section text, collapsing the whitespace between tags for readability
all_divs = soup.find_all('div', {"subtype": "section"})
for div in all_divs:
    n = div['n']
    said = div.find_all('said', {"direct": "false"})
    if len(said) == 1:
        who = said[0]['who']
        text_div = re.sub(r"'\\n\s+|'\\n', ", "", str(div.find_all(text=True)))
        said = said[0]
        text_said = re.sub(r"'\\n\s+|'\\n', ", "", str(said.find_all(text=True)))
        if text_div != text_said:
            str_div = str(div)
            str_div = re.sub(r">[\\n \\t\\r\\s]+<", "><", str_div, flags=re.MULTILINE)
            print(str_div)
            print()
import re
all_divs = soup.find_all('div', {"subtype": "section"})
for div in all_divs:
    n = div['n']
    said = div.find_all('said', {"direct": "false"})
    if len(said) == 1:
        who = said[0]['who']
        text_div = re.sub(r"'\\n\s+|'\\n', ", "", str(div.find_all(text=True)))
        said = said[0]
        text_said = re.sub(r"'\\n\s+|'\\n', ", "", str(said.find_all(text=True)))
        if text_div != text_said:
            str_div = str(div)
            str_div = str_div.replace("\n", "")
            str_div = re.sub(r">[\\n \\t\\r\\s]+<", "><", str_div, flags=re.MULTILINE)
            print(str_div)
            print()
speaker = {}
import re
all_divs = soup.find_all('div', {"subtype": "section"})
for div in all_divs:
    n = div['n']
    said = div.find_all('said', {"direct": "false"})
    if len(said) == 1:
        who = said[0]['who']
        text_div = re.sub(r"'\\n\s+|'\\n', ", "", str(div.find_all(text=True)))
        said = said[0]
        text_said = re.sub(r"'\\n\s+|'\\n', ", "", str(said.find_all(text=True)))
        if text_div != text_said:
            str_div = str(div)
            str_div = str_div.replace("\n", "")
            str_div = re.sub(r">[\\n \\t\\r\\s]+<", "><", str_div, flags=re.MULTILINE)
            print(n, who)
            print()
# Build the speaker dictionary for mixed sections: where the merged Apollodorus
# <said> immediately wraps the nested indirect speech, record its speaker and
# use the ed="P" milestones to see where that nested <said> closes
speaker = {}
import re
all_divs = soup.find_all('div', {"subtype": "section"})
for div in all_divs:
    n = div['n']
    said = div.find_all('said', {"direct": "false"})
    if len(said) == 1:
        who = said[0]['who']
        text_div = re.sub(r"'\\n\s+|'\\n', ", "", str(div.find_all(text=True)))
        said = said[0]
        text_said = re.sub(r"'\\n\s+|'\\n', ", "", str(said.find_all(text=True)))
        if text_div != text_said:
            str_div = str(div)
            str_div = str_div.replace("\n", "")
            str_div = re.sub(r">[\\n \\t\\r\\s]+<", "><", str_div, flags=re.MULTILINE)
            if '<said rend="merge" who="#Apollodorus"><said' in str_div:
                speaker[n] = who
                list_milestones = re.findall('(</said>)?<milestone ed="P"', str_div)
                print(list_milestones)
                print(n)
                print(list_milestones.index('</said>'))
                print()
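# A toy illustration (not from the original notebook) of the optional-group
# trick above: findall returns one entry per <milestone ed="P">, an empty string
# when no </said> immediately precedes it, so .index('</said>') gives the
# position of the milestone right after the nested speech closes.
toy_xml = '<milestone ed="P"/>text<milestone ed="P"/>speech</said><milestone ed="P"/>'
print(re.findall('(</said>)?<milestone ed="P"', toy_xml))                   # ['', '', '</said>']
print(re.findall('(</said>)?<milestone ed="P"', toy_xml).index('</said>'))  # 2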
# Write the modified Greek sections to divs2.xml: for each section with a
# recorded speaker, open a nested indirect <said> inside the merged
# Apollodorus <said>
f = open('divs2.xml', 'w')
all_divs = greek_soup.find_all('div', {"subtype": "section"})
for div in all_divs:
    n = div['n']
    str_div = str(div)
    if n in speaker:
        #str_div = str_div.replace('</said>', '</said></said>')
        str_div = str_div.replace('<said rend="merge" who="#Ἀπολλόδωρος">', '<said who="#Ἀπολλόδωρος" rend="merge"><said who="{}" direct="false">'.format(speaker[n]))
    f.write(str_div)
f.close()
# Import the required libraries
import os
import re
from bs4 import BeautifulSoup
# Download the XML files
!wget https://raw.githubusercontent.com/PerseusDL/canonical-greekLit/master/data/tlg0059/tlg011/tlg0059.tlg011.perseus-eng2.xml
!wget https://raw.githubusercontent.com/PerseusDL/canonical-greekLit/master/data/tlg0059/tlg011/tlg0059.tlg011.perseus-grc2.xml
# Read the XML files
with open('tlg0059.tlg011.perseus-eng2.xml', 'r') as file:
    data_eng = file.read()
with open('tlg0059.tlg011.perseus-grc2.xml', 'r') as file:
    data_greek = file.read()
# Parse the XML files with BeautifulSoup
eng_soup = BeautifulSoup(data_eng, 'xml')
greek_soup = BeautifulSoup(data_greek, 'xml')
# Create a dictionary to store the speaker information
speaker = {}
# Find all div elements with subtype "section" in the English XML
all_divs = eng_soup.find_all('div', {"subtype": "section"})
# Go through each div element and extract the speaker information
for div in all_divs:
    n = div['n']
    said = div.find_all('said', {"direct": "false"})
    if len(said) == 1:
        who = said[0]['who']
        text_div = re.sub(r"'\\n\s+|'\\n', ", "", str(div.find_all(text=True)))
        said = said[0]
        text_said = re.sub(r"'\\n\s+|'\\n', ", "", str(said.find_all(text=True)))
        if text_div != text_said:
            str_div = str(div)
            str_div = str_div.replace("\n", "")
            str_div = re.sub(r">[\n \t\r\s]+<", "><", str_div, flags=re.MULTILINE)
            if '<said rend="merge" who="#Apollodorus"><said' in str_div:
                speaker[n] = who
                list_milestones = re.findall('(</said>)?<milestone ed="P"', str_div)
                print(list_milestones)
                print(n)
                print(list_milestones.index('</said>'))
                print()
# Open a new file and write the modified Ancient Greek XML
with open('divs2.xml', 'w') as file:
    all_divs = greek_soup.find_all('div', {"subtype": "section"})
    for div in all_divs:
        n = div['n']
        str_div = str(div)
        if n in speaker:
            str_div = str_div.replace('<said rend="merge" who="#Ἀπολλόδωρος">', '<said who="#Ἀπολλόδωρος" rend="merge"><said who="{}" direct="false">'.format(speaker[n]))
        file.write(str_div)
# Report that the file was written
print('success')
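# A quick sanity check (not in the original notebook): compare the number of
# nested indirect <said> openings actually written to divs2.xml with the number
# of sections recorded in `speaker`.
with open('divs2.xml') as fp:
    written = fp.read()
print(written.count('direct="false"'), len(speaker))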
from bs4 import BeautifulSoup
# Load the XML file
with open("symposium_ed.xml", "r") as file:
    xml_data = file.read()
# Parse the XML
soup = BeautifulSoup(xml_data, "xml")
# Find all <said> tags
said_tags = soup.find_all("said")
# Extract quotes and names
for said_tag in said_tags:
    name = said_tag["who"]
    quote = said_tag.get_text(strip=True)
    print(f"{name}:")
    print(quote)
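# A possible follow-up (not in the original notebook): tally how many <said>
# passages each speaker has, reusing the said_tags collected above.
from collections import Counter
counts = Counter(tag["who"] for tag in said_tags)
for who, count in counts.most_common():
    print(who, count)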