<a href="https://colab.research.google.com/github/lucasgneccoh/BDSS_Dauphine/blob/main/notebooks/solutions/BDSS_TD2_XML_DOM_solutions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bases de données semi-structurées - TD 2 - XML and DOM in Python

Main teacher: **Dario COLAZZO**

Teaching Assistant: **Lucas GNECCO**

Special thanks to **Beatrice NAPOLITANO**

Université Paris Dauphine - PSL

# TODO:

    - Make the notebook self contained, bring content from the pdf

# Introduction

Welcome!

In this notebook we will practice XML using the DOM implementation in Python. This will allow us to explore an XML graph using Python objects and expressions, which should make things a lot easier!


Here are some important documentation pages and resources:

https://docs.python.org/3/library/xml.dom.html

https://docs.python.org/3/library/xml.dom.minidom.html

# Preamble

In [None]:
from lxml import etree
import re
from xml.dom.minidom import parse

# Functions to work with XML files

def validate_xml(xml_path: str, dtd_path: str) -> bool:
    """Validate an XML file against a DTD using the lxml library.

    Args:
        xml_path: Path of the XML document to check.
        dtd_path: Path of the DTD file to validate against.

    Returns:
        The result of ``dtd.validate`` when both files could be parsed;
        False when the DTD cannot be parsed or the XML is not well-formed.
        A diagnostic is printed in every failure case.
    """
    try:
        # Context manager: the DTD file handle is closed even when parsing
        # fails (the original left the handle open).
        with open(dtd_path) as dtd_file:
            dtd = etree.DTD(dtd_file)
    except etree.DTDParseError as ed:
        print(f"DTDParseError: {ed}")
        for i, er in enumerate(ed.error_log):
            print(f"\t{i}-> {er.message}, at line {er.line}")
        etree.clear_error_log()
        return False

    try:
        xml_doc = etree.parse(xml_path)
    except etree.XMLSyntaxError as e:
        print(f"XMLSyntaxError: {e}")
        for i, er in enumerate(e.error_log):
            print(f"\t{i}-> {er.message}, at line {er.line}")
        etree.clear_error_log()
        return False

    result = dtd.validate(xml_doc)
    if not result:
        # Only the first validation error is reported.
        print(dtd.error_log[0])

    return result

def write_xml_dtd_files_from_strings(xml_strings, dtd_strings, identifiers = None):
    """Write XML and DTD strings to ``<identifier>.xml`` / ``<identifier>.dtd`` files.

    Args:
        xml_strings: A single XML string, or a list of XML strings.
        dtd_strings: A single DTD string, or a list of DTD strings (same
            length as ``xml_strings``).
        identifiers: File name stems, one per XML/DTD pair. When omitted,
            ``file_0``, ``file_1``, ... are used.

    Returns:
        The list of identifiers that determined the file names.

    Raises:
        Exception: If the numbers of XML and DTD strings differ.
        ValueError: If the number of identifiers does not match the number
            of XML/DTD pairs (previously the extra items were silently
            dropped by ``zip``).
    """
    # If single strings are given, encapsulate them in lists
    if isinstance(xml_strings, str) and isinstance(dtd_strings, str):
        xml_strings, dtd_strings = [xml_strings], [dtd_strings]
        # A single identifier given as a bare string names that single pair.
        if isinstance(identifiers, str):
            identifiers = [identifiers]

    if len(xml_strings) != len(dtd_strings):
        raise Exception("Different number of XML and DTD strings!")

    # If no identifiers are given, create default ones. This determines file names
    if identifiers is None:
        identifiers = [f"file_{i}" for i in range(len(xml_strings))]
    elif len(identifiers) != len(xml_strings):
        raise ValueError("Different number of identifiers and XML/DTD strings!")

    try:
        for xml_text, dtd_text, stem in zip(xml_strings, dtd_strings, identifiers):
            xml_path, dtd_path = f"{stem}.xml", f"{stem}.dtd"
            # UTF-8 is XML's default encoding; do not depend on the locale.
            with open(xml_path, "w", encoding="utf-8") as f:
                f.write(xml_text)
            with open(dtd_path, "w", encoding="utf-8") as f:
                f.write(dtd_text)
    except Exception:
        print("Problems while writing XML and DTD files")
        # Bare raise keeps the original traceback intact.
        raise

    return identifiers



def test_validation(xml_string, dtd_string, validator):
    """Validate an XML document against a DTD, both supplied as strings.

    The strings are first written to ``temp.xml`` and ``temp.dtd``; the
    ``validator`` callable is then invoked with those two paths (XML first,
    DTD second) and its result is returned unchanged.
    """
    stem = "temp"

    # Materialise both documents on disk, since the validator works on paths.
    write_xml_dtd_files_from_strings(xml_string, dtd_string, identifiers=[stem])

    return validator(f"{stem}.xml", f"{stem}.dtd")

def xpath_query_xml_string(xml_string, query_string):
    """Evaluate an XPath query against an XML document given as a string.

    The document is written to ``xml_doc.xml`` (overwritten on each call),
    parsed with lxml, and queried.

    Args:
        xml_string: The XML document.
        query_string: The XPath expression to evaluate.

    Returns:
        Whatever the compiled ``etree.XPath`` object returns for the document.
    """
    xml_path = "xml_doc.xml"
    # Remove whitespace-only runs between tags to keep the 'real' text of
    # each node. The original pattern "[\s|\n]" also matched a literal '|'
    # and relied on an invalid escape in a non-raw string.
    compact_xml = re.sub(r">\s*<", "><", xml_string)
    # UTF-8 is XML's default encoding; do not depend on the locale.
    with open(xml_path, "w", encoding="utf-8") as f:
        f.write(compact_xml)
    xml_doc = etree.parse(xml_path)
    query = etree.XPath(query_string)
    return query(xml_doc)

def xpath_query_xml_file(xml_path, query_string):
    """Evaluate the XPath expression ``query_string`` on the XML file at ``xml_path``.

    The file is parsed first, then the expression is compiled and applied to
    the parsed document; the result of that call is returned.
    """
    tree = etree.parse(xml_path)
    return etree.XPath(query_string)(tree)


def print_xpath_query_results(results):
    """Print a readable summary of a sequence of XPath query results.

    For each result the tag, text and attributes are printed. Results that
    lack the ``tag`` / ``text`` / ``items`` attributes are printed as-is
    after an ``--Except`` marker.

    Args:
        results: A sized, iterable collection of query results.
    """
    print(f"Total results: {len(results)}")
    print("*"*20 + "\n")
    for e in results:
        try:
            print(f"node tag: {e.tag}")
            print(f"node text: *{e.text}*")
            print(', '.join([f"{k} = {v}" for k, v in e.items()]))
            print("-"*20)
        except AttributeError:
            # Not an element-like object: fall back to printing it directly.
            # Only AttributeError is caught so that KeyboardInterrupt and
            # unrelated errors are no longer swallowed.
            print("--Except")
            print(e)

# Examples with DOM

In [None]:
# Sample address book used by the DOM example: a `carnet` root holding two
# `address` elements, only the first of which has a `name` attribute.
# NOTE(review): the DOCTYPE names `address` as the document element while the
# actual root is `carnet`, and it points at "dtddoc.dtd" rather than the
# "carnet.dtd" written below. Presumably harmless for the minidom example
# (no validation is requested here), but confirm the intent.
xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
	<!DOCTYPE address SYSTEM "dtddoc.dtd">
	<carnet>
		<address name="Beatrice Napolitano" id="_1">
			<company>Paris-Dauphine</company>
			<phone>06 12345678</phone>
		</address>
		<address id="_2">
			<company>Paris-Dauphine</company>
			<phone>06 99999999</phone>
		</address>
	</carnet>'''

# Write the document to carnet.xml (an empty carnet.dtd is written alongside it).
write_xml_dtd_files_from_strings(xml_string, "", identifiers = ["carnet"])

def example_getId():
    """Print the ids of the named addresses found in ``carnet.xml``.

    First prints whether the parsed document has child nodes, then, for every
    ``address`` element that has a ``name`` attribute, the value of its
    ``id`` attribute.
    """
    document = parse("carnet.xml")
    print(document.hasChildNodes())

    named_addresses = [
        address
        for address in document.getElementsByTagName("address")
        if address.hasAttribute("name")
    ]
    for address in named_addresses:
        print(address.getAttribute("id"))
   

# Run the example; it reads the carnet.xml file from the working directory.
example_getId()

# Ex 1 to 5
We will proceed as we did in Ex 1 of the last TD.
Write your DTD and XML files, and validate them using the functions given above.

In [None]:
# Contents of the DTD (dtddoc.dtd): replace the placeholder with your DTD.
dtd_string = \
'''
DTD file goes here
'''

# Contents of the XML document (xmldoc.xml): replace the placeholder with your XML.
# Presumably the goal is a document that is correct (valid) with respect to the DTD above.

xml_string = \
'''
XML file goes here
'''

# Both strings are written to temp.xml / temp.dtd and checked with validate_xml;
# the validation result is printed.
print(test_validation(xml_string, dtd_string, validate_xml))

# Ex 6
The idea now is to practice the same queries, this time using this new tool.

We will use the same sample files from the last TD

Remember, the documentation for DOM is here: https://docs.python.org/3/library/xml.dom.html

In [None]:
dtd_link = "https://raw.githubusercontent.com/lucasgneccoh/BDSS_students/main/data/films.dtd"
xml_link = "https://raw.githubusercontent.com/lucasgneccoh/BDSS_students/main/data/films.xml"

!rm "./films.dtd"
!rm "./films.xml"

# Download the imdb sample file
!wget {dtd_link}
!wget {xml_link}

# If the download fails, you will have to load the files into the Colab session. 
# Go to the Files section on the left panel

if validate_xml("films.xml", "films.dtd"):
    print("Files were downloaded correctly")

In [None]:
# Parse films.xml once; the resulting DOM document is passed to the queries below.
dom = parse("films.xml")

def getText(node):
    """Return the text of a node that only contains text.

    The data of every text and CDATA child is concatenated, so an empty
    element yields ``""`` instead of raising ``IndexError``, and text split
    over several child nodes is returned in full.

    Args:
        node: A DOM node, typically an element such as ``<TITRE>``.

    Returns:
        The concatenated character data of the node's direct children.
    """
    return "".join(
        child.data
        for child in node.childNodes
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
    )

# ----------------------------------------------------------------------------
# Query 1
# La liste des titres de films.      

def dom_query_1_1(dom):
    """Query 1, variant 1: list the titles of the films.

    Returns the data of the first child node of every ``TITRE`` element in
    the document, in document order.
    """
    return [
        title_node.childNodes[0].data
        for title_node in dom.getElementsByTagName("TITRE")
    ]

def dom_query_1_2(dom):
    """Query 1, variant 2: list the titles of the films.

    Visits every ``FILM`` element and collects the text of each ``TITRE``
    element found beneath it.
    """
    collected = []
    for film in dom.getElementsByTagName("FILM"):
        collected.extend(
            getText(title_node)
            for title_node in film.getElementsByTagName("TITRE")
        )
    return collected

# Run both variants of Query 1 and print their results, then a separator line.
ans = dom_query_1_1(dom)
print("1.1\t", ans)

ans = dom_query_1_2(dom)
print("1.2\t",ans)

print("----"*20)

# ----------------------------------------------------------------------------
# Query 2
# Les titres des films parus en 1980.

def dom_query_2_1(dom, annee):
    """Query 2, variant 1: titles of the films released in a given year.

    Args:
        dom: The parsed films document.
        annee: The release year, as an int, compared with the ``Annee``
            attribute of each ``FILM`` element.

    Returns:
        The list of matching film titles, in document order.
    """
    titre = []
    for f in dom.getElementsByTagName("FILM"):
        # Films without the attribute are skipped: getAttribute would return
        # "" and int("") raises ValueError.
        if not f.hasAttribute("Annee") or int(f.getAttribute("Annee")) != annee:
            continue
        # Look the title up by tag name rather than by child position, so
        # whitespace text nodes between elements do not break the lookup.
        titre.append(f.getElementsByTagName("TITRE")[0].childNodes[0].data)
    return titre

def dom_query_2_2(dom, annee):
    """Query 2, variant 2: titles of the films released in year ``annee``.

    Films without an ``Annee`` attribute are ignored; for the others the
    attribute is converted to int and compared with ``annee``.
    """
    matches = []
    for film in dom.getElementsByTagName("FILM"):
        if not film.hasAttribute("Annee"):
            continue
        if int(film.getAttribute("Annee")) != annee:
            continue
        matches.extend(
            getText(title_node)
            for title_node in film.getElementsByTagName("TITRE")
        )
    return matches


# Run both variants of Query 2 for the year 1980 and print their results.
ans = dom_query_2_1(dom, 1980)
print("2.1\t", ans)

ans = dom_query_2_2(dom, 1980)
print("2.2\t",ans)

print("----"*20)

# ----------------------------------------------------------------------------
# Query 3
# Le résumé d'Alien.

def dom_query_3_1(dom, title):
    """Query 3, variant 1: the summary (``RESUME``) of the film named ``title``.

    Returns the text of the first ``RESUME`` of the first matching film that
    has one, or the string ``"No resume"`` when there is none.
    """
    for film in dom.getElementsByTagName("FILM"):
        first_title = film.getElementsByTagName("TITRE")[0]
        if first_title.childNodes[0].data != title:
            continue
        summaries = film.getElementsByTagName("RESUME")
        if summaries.length != 0:
            return summaries[0].childNodes[0].data
    return "No resume"

def dom_query_3_2(dom, title):
    """Query 3, variant 2: the summary (``RESUME``) of the film named ``title``.

    Starts from the ``TITRE`` elements, and for a matching title scans the
    children of its parent for a ``RESUME`` element. Returns that element's
    text, or ``"No resume"`` when nothing is found.
    """
    for title_node in dom.getElementsByTagName("TITRE"):
        if title_node.childNodes[0].data != title:
            continue
        for sibling in title_node.parentNode.childNodes:
            # Only element nodes carry a tagName; skip text nodes and the like.
            if sibling.nodeType != sibling.ELEMENT_NODE:
                continue
            if sibling.tagName == "RESUME":
                return sibling.childNodes[0].data
    return "No resume"

def dom_query_3_3(dom, title):
    """Query 3, variant 3: the summaries of the films named ``title``.

    Unlike the other variants, this one returns a list: the text of every
    ``RESUME`` element of every film having a ``TITRE`` equal to ``title``.
    """
    summaries = []
    for film in dom.getElementsByTagName("FILM"):
        for title_node in film.getElementsByTagName("TITRE"):
            if getText(title_node) != title:
                continue
            # Matching film: gather the text of its RESUME nodes.
            summaries.extend(
                getText(summary_node)
                for summary_node in film.getElementsByTagName("RESUME")
            )
    return summaries

# Run the three variants of Query 3 for the film "Alien" and print their results.
ans = dom_query_3_1(dom, "Alien")
print("3.1\t", ans)

ans = dom_query_3_2(dom, "Alien")
print("3.2\t",ans)

ans = dom_query_3_3(dom, "Alien")
print("3.3\t",ans)

print("----"*20)

# ----------------------------------------------------------------------------
# Query 4
# Les titres des films avec Bruce Willis.

def dom_query_4_1(dom, nom, prenom):
    """Query 4, variant 1: titles of the films featuring a given actor.

    Args:
        dom: The parsed films document.
        nom: The actor's last name (text of the role's ``NOM`` element).
        prenom: The actor's first name (text of the role's ``PRENOM`` element).

    Returns:
        One title per matching ``ROLE`` element, in document order.
    """
    titre = []
    for role in dom.getElementsByTagName("ROLE"):
        # Children are looked up by tag name instead of by position, so the
        # query also works when whitespace text nodes sit between elements.
        role_nom = role.getElementsByTagName("NOM")[0].childNodes[0].data
        role_prenom = role.getElementsByTagName("PRENOM")[0].childNodes[0].data
        if role_nom == nom and role_prenom == prenom:
            # The grandparent of a ROLE is taken to be its film, as in the
            # original implementation.
            film = role.parentNode.parentNode
            titre.append(film.getElementsByTagName("TITRE")[0].childNodes[0].data)
    return titre

def dom_query_4_2(dom, nom, prenom):
    """Query 4, variant 2: titles of the films featuring a given actor.

    For each ``FILM``, every ``ROLE`` whose first ``PRENOM`` and first ``NOM``
    match ``prenom`` and ``nom`` contributes the film's first ``TITRE``.
    """
    found = []
    for film in dom.getElementsByTagName("FILM"):
        for role in film.getElementsByTagName("ROLE"):
            # First name is checked first; the last name only if it matched.
            if role.getElementsByTagName('PRENOM')[0].childNodes[0].data != prenom:
                continue
            if role.getElementsByTagName('NOM')[0].childNodes[0].data != nom:
                continue
            title_node = film.getElementsByTagName("TITRE")[0]
            found.append(title_node.childNodes[0].data)
    return found

def dom_query_4_3(dom, nom, prenom):
    """Query 4, variant 3: titles of the films featuring a given actor.

    Walks FILM -> ROLES -> ROLE and, for each role whose ``PRENOM`` and
    ``NOM`` texts equal ``prenom`` and ``nom``, records the film's title.
    """
    matches = []
    for film in dom.getElementsByTagName("FILM"):
        for roles_node in film.getElementsByTagName("ROLES"):
            for role in roles_node.getElementsByTagName("ROLE"):
                first_name_node = role.getElementsByTagName("PRENOM")[0]
                last_name_node = role.getElementsByTagName("NOM")[0]
                if getText(first_name_node) != prenom or getText(last_name_node) != nom:
                    continue
                matches.append(getText(film.getElementsByTagName("TITRE")[0]))
    return matches

# Run the three variants of Query 4 for Bruce Willis (last name first, then
# first name) and print their results.
ans = dom_query_4_1(dom, "Willis", "Bruce")
print("4.1\t", ans)

ans = dom_query_4_2(dom, "Willis", "Bruce")
print("4.2\t",ans)

ans = dom_query_4_3(dom, "Willis", "Bruce")
print("4.3\t",ans)

print("----"*20)

# ----------------------------------------------------------------------------
# Query 5
# Les titres des films qui ont un résumé.


def dom_query_5_1(dom):
    """Query 5, variant 1: titles of the films that have a summary.

    Args:
        dom: The parsed films document.

    Returns:
        The title of every ``FILM`` containing at least one ``RESUME``
        element, in document order.
    """
    titles = []
    for film in dom.getElementsByTagName("FILM"):
        if len(film.getElementsByTagName("RESUME")) > 0:
            # Find the title by tag name rather than assuming it is the very
            # first child node, which fails when whitespace precedes it.
            titles.append(film.getElementsByTagName("TITRE")[0].childNodes[0].data)
    return titles

def dom_query_5_2(dom):
    """Query 5, variant 2: titles of the films that have a summary.

    Scans the direct children of each ``FILM`` and records the film's title
    once per ``RESUME`` child found.

    Args:
        dom: The parsed films document.

    Returns:
        The list of titles, in document order.
    """
    titre = []
    for film in dom.getElementsByTagName("FILM"):
        for child in film.childNodes:
            # Use the DOM node type instead of comparing Python types; only
            # element nodes have a tagName.
            if child.nodeType == child.ELEMENT_NODE and child.tagName == "RESUME":
                # Title looked up by tag name, so whitespace text nodes
                # before TITRE do not break the lookup.
                titre.append(film.getElementsByTagName("TITRE")[0].childNodes[0].data)
    return titre

def dom_query_5_3(dom):
    """Query 5, variant 3: titles of the films that have a summary.

    A film qualifies when it contains at least one ``RESUME`` element; the
    text of its first ``TITRE`` is then collected.
    """
    result = []
    for film in dom.getElementsByTagName("FILM"):
        if len(film.getElementsByTagName("RESUME")) == 0:
            continue
        first_title = film.getElementsByTagName("TITRE")[0]
        result.append(getText(first_title))
    return result

# Run the three variants of Query 5 and print their results.
ans = dom_query_5_1(dom)
print("5.1\t", ans)

ans = dom_query_5_2(dom)
print("5.2\t",ans)

ans = dom_query_5_3(dom)
print("5.3\t",ans)

print("----"*20)

# ----------------------------------------------------------------------------
# Query 6
# Les titres des films qui n'ont pas de résumé.

def dom_query_6(dom):
    """Query 6: titles of the films that have no summary.

    A film qualifies when it contains no ``RESUME`` element at all; the text
    of its first ``TITRE`` is then collected.
    """
    result = []
    for film in dom.getElementsByTagName("FILM"):
        if len(film.getElementsByTagName("RESUME")) > 0:
            continue
        first_title = film.getElementsByTagName("TITRE")[0]
        result.append(getText(first_title))
    return result

# Run Query 6 and print its result.
ans = dom_query_6(dom)
print("6.1\t",ans)

print("----"*20)

# ----------------------------------------------------------------------------
# Query 7
# Les titres des films vieux de plus de trente ans.


def dom_query_7(dom, present, gap):
    """Query 7: titles of the films older than ``gap`` years.

    A film's age is ``present`` minus its ``Annee`` attribute (converted to
    int); films without that attribute are ignored. A film is kept when its
    age is strictly greater than ``gap``.
    """
    old_titles = []
    for film in dom.getElementsByTagName("FILM"):
        if not film.hasAttribute("Annee"):
            continue
        age = present - int(film.getAttribute("Annee"))
        if age > gap:
            old_titles.extend(
                getText(title_node)
                for title_node in film.getElementsByTagName("TITRE")
            )
    return old_titles

# Run Query 7 with 2022 as the `present` year and a `gap` of 30 years.
ans = dom_query_7(dom, 2022, 30)
print("7.1\t",ans)

print("----"*20)

# ----------------------------------------------------------------------------
# Query 8
# Quel rôle joue Harvey Keitel dans Reservoir dogs ?

def dom_query_8(dom, title, nom, prenom):
    """Query 8: the role played by a given actor in a given film.

    Looks at the films whose first ``TITRE`` equals ``title`` and returns the
    text of the ``INTITULE`` of the first role whose ``PRENOM`` / ``NOM``
    match ``prenom`` / ``nom``. Returns None when no such role exists.
    """
    for film in dom.getElementsByTagName("FILM"):
        if getText(film.getElementsByTagName("TITRE")[0]) != title:
            continue
        for roles_node in film.getElementsByTagName("ROLES"):
            for role in roles_node.getElementsByTagName("ROLE"):
                first_name_node = role.getElementsByTagName("PRENOM")[0]
                last_name_node = role.getElementsByTagName("NOM")[0]
                same_actor = (
                    getText(first_name_node) == prenom
                    and getText(last_name_node) == nom
                )
                if same_actor:
                    return getText(role.getElementsByTagName("INTITULE")[0])
    return None

# Run Query 8: the role of Harvey Keitel (last name first, then first name)
# in "Reservoir Dogs".
ans = dom_query_8(dom, "Reservoir Dogs", "Keitel", "Harvey")
print("8.1\t",ans)

print("----"*20)

# ----------------------------------------------------------------------------
# Query 9
# Quel est le dernier film du document ?

def dom_query_9(dom):
    """Query 9: the title of the last film in the document.

    Takes the last ``FILM`` element in document order and returns the text
    of its first ``TITRE``.
    """
    last_film = dom.getElementsByTagName("FILM")[-1]
    title_node = last_film.getElementsByTagName("TITRE")[0]
    return getText(title_node)

# Run Query 9 and print its result.
ans = dom_query_9(dom)
print("9.1\t",ans)