
Combining large XML files efficiently with Python

Tags: python, lxml

I have about 200 XML files ranging from 5 MB to 50 MB, with 80% of them under 10 MB. These files contain multiple elements with both overlapping and unique data. My goal is to combine all these files by performing a logical union over all the elements.

The code seems to work but gets exponentially slower the more files it has to process. For example, it takes about 20 seconds to process the first 5 files, about a minute for the next five, about 5 minutes for the five after that, and so on, while also using significantly more memory than the sum total of all the files. The overall process is in its 4th hour as I type this.

This is obviously a 'to be expected' effect, considering that the lookup needs to happen on an ever-larger tree. Still, I wonder if there are ways to at least diminish this effect.

I have tried implementing a form of simple caching, but I didn't notice any significant improvement.

I also tried multiprocessing, which does help, but it adds extra complexity and pushes the problem down to the hardware level, which does not feel very optimal.

Is there something I can do to improve the performance in any way?

Note: I had to obfuscate parts of the code and data for confidentiality reasons. Please don't hesitate to let me know if this breaks the example.

code:

import os
import time

import lxml.etree
from lxml.etree import Element

# Edit2: added timing prints

def process_elements(files: list[str],
                     identifier: int) -> lxml.etree._Element | None:

    base_el = Element('BASE')
    i = 0
    cache = {}  # Edit1. Missed this line

    
    start = time.time()
    time_spent_reading = 0
    lookup_time = [0, 0]
    append_new_el_time = [0, ]
    cache_search_time = [0, 0]
    recursive_calls_counter = [0, ]


    for file in files:
        i += 1
        print(f"Process: {indentifier}, File {i} of {len(files)}: {file}")
        print("Reading file...")
        start_read = time.time()

        tree = lxml.etree.parse(f'data/{file}').getroot()
        time_spent_reading += time.time() - start_read
        print(f"Reading file took {time.time() - start_read} seconds")
        print("Since start: ", time.time() - start)

        packages = tree.find('BASE')  # NB: in the obfuscated sample the root itself is <BASE>, so this may need to be just `tree`
        

        print("Starting walk...")
        start_walk = time.time()
        for package in packages:
            walk(package, base_el, cache, lookup_time,
                 append_new_el_time, cache_search_time,
                 recursive_calls_counter)

        print(f"Walk took {time.time() - start_walk} seconds")
        print("Since start: ", time.time() - start)

    if identifier == -1:
        return base_el
    else:
        print("Timing results:")
        print("Time spent reading: ", time_spent_reading)
        print("Time spent on lookup: ", lookup_time[0])
        print("Time spent on append: ", append_new_el_time[0])
        print("Time spent on cache search: ", cache_search_time[0])

        base_el.getroottree().write(
            f'temp{identifier}.xml', encoding='utf-8')
        return None
    

def walk(element: lxml.etree._Element,
         reference: lxml.etree._Element,
         cache: dict,
         lookup_time,
         append_new_el_time,
         cache_search_time,
         recursive_calls_counter) -> None:

    recursive_calls_counter[0] += 1

    children = element.iterchildren()
    elid = f"{element.tag}"
    element_name = element.get('some-id-i-need')
    if element_name is not None:
        elid += f'[@some-id-i-need="{element_name}"]'

    cache_id = str(id(reference)) + "_" + elid

    cache_search_time_start = time.time()
    relevant_data = cache.get(cache_id)
    cache_search_time[0] += time.time() - cache_search_time_start

    # if element is found either in cache or in the new merged object
    # continue to its children
    # otherwise, element does not exist in merged object. 
    # Add it to the merged object and to cache

    if relevant_data is None:
        # I believe this lookup may be what takes the most time
        # hence my attempt to cache this
        lookup_time_start = time.time()
        relevant_data = reference.find(elid)   
        lookup_time[0] += time.time() - lookup_time_start
        lookup_time[1] += 1
    else:
        # cache hit
        cache_search_time[1] += 1

    if relevant_data is None:
        append_new_el_time_start = time.time()
        reference.append(element)
        append_new_el_time[0] += time.time() - append_new_el_time_start
        return

    else:
        cache.setdefault(cache_id, relevant_data)
        # if element has no children, loop will not run
        for child in children:
            walk(child, relevant_data, cache, lookup_time,
                 append_new_el_time,
                 cache_search_time,
                 recursive_calls_counter)


# to run: process_elements(os.listdir("data"), -1)
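
As a reference for the multiprocessing attempt mentioned above, here is a minimal sketch (an assumed wiring, not the exact code used) of how the file list might be split across worker processes, with each worker writing its own temp{identifier}.xml; the worker count, chunking, and the final merge step are all hypothetical:

# Hypothetical wiring for the multiprocessing attempt described above.
# Each worker merges its own chunk of files and writes temp{identifier}.xml;
# the partial results still need to be merged in a final pass afterwards.
import os
from multiprocessing import Pool

def run_chunk(args):
    files, identifier = args
    process_elements(files, identifier)

if __name__ == '__main__':
    all_files = os.listdir('data')
    n_workers = 4  # assumed worker count
    chunks = [(all_files[i::n_workers], 100 + i) for i in range(n_workers)]
    with Pool(n_workers) as pool:
        pool.map(run_chunk, chunks)
    # temp100.xml .. temp103.xml then still need a final merge pass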

example data:

file1

<BASE>
    <elem id="1">
        <data-tag id="1">
            <object id="23124">
                <POS Tag="V" />
                <grammar type="STEM" />
                <Aspect type="IMPV" />
                <Number type="S" />
            </object>
            <object id="128161">
                <POS Tag="V" />
                <grammar type="STEM" />
                <Aspect type="IMPF" />
            </object>
        </data-tag>

    </elem>
</BASE>

file2

<BASE>
    <elem id="1">
        <data-tag id="1">
            <object id="23124">

                <concept type="t1" />
            </object>
            <object id="128161">

                <concept type="t2" />
            </object>
        </data-tag>
        <data-tag id="2">
            <object id="128162">
                <POS Tag="P" />
                <grammar type="PREFIX" />
                <Tag Tag="bi+" />
                <concept type="t3" />
            </object>
        </data-tag>
    </elem>
</BASE>

result:

<BASE>
    <elem id="1">
        <data-tag id="1">
            <object id="23124">
                <POS Tag="V" />
                <grammar type="STEM" />
                <Aspect type="IMPV" />
                <Number type="S" />
                <concept type="t1" />
            </object>
            <object id="128161">
                <POS Tag="V" />
                <grammar type="STEM" />
                <Aspect type="IMPF" />
                <concept type="t2" />
            </object>
        </data-tag>
        <data-tag id="2">
            <object id="128162">
                <POS Tag="P" />
                <grammar type="PREFIX" />
                <Tag Tag="bi+" />
                <concept type="t3" />
            </object>
        </data-tag>
    </elem>
</BASE>

Edit2: Timing results after processing 10 files (about 60MB, 1m 24.8s):


Starting process...
Process: 102, File 1 of 10:
Reading file...
Reading file took 0.1326887607574463 seconds
Since start:  0.1326887607574463
preprocesing...
merging...
Starting walk...
Walk took 0.8433401584625244 seconds
Since start:  1.0600318908691406
Process: 102, File 2 of 10:
Reading file...
Reading file took 0.04700827598571777 seconds
Since start:  1.1070401668548584
preprocesing...
merging...
Starting walk...
Walk took 1.733034610748291 seconds
Since start:  2.8680694103240967
Process: 102, File 3 of 10:
Reading file...
Reading file took 0.041702985763549805 seconds
Since start:  2.9097723960876465
preprocesing...
merging...
...
Time spent on lookup:  79.53011083602905
Time spent on append:  1.1502337455749512
Time spent on cache search:  0.11017322540283203
Cache size:  30176

# Edit3: extra data
Number of cache hits:  112503
Cache size:  30177
Number of recursive calls:  168063


As an observation, I do expect significant overlap between the files; maybe the small cache search time indicates that something is wrong with how I implemented caching?

Edit3: It does seem that I get a lot of hits, but the strange part is that if I comment out the cache search, it makes almost no difference in performance. In fact, it ran marginally faster without it (although I am not sure whether a few seconds is a significant difference or just random chance in this case):

relevant_data = None  # cache.get(cache_id)

log with cache commented out:

Time spent on lookup:  71.13456320762634
Number of lookups:  168063
Time spent on append:  3.9656710624694824
Time spent on cache search:  0.020023584365844727
Number of cache hits:  0
Cache size:  30177
Number of recursive calls:  168063
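
For context on why the lookup dominates even with many cache hits: reference.find() scans the children of the merged element linearly, so each cache miss gets more expensive as the merged tree grows, while a dict lookup stays effectively constant. A rough, hypothetical micro-benchmark (not from the original post) illustrating the gap:

# Hypothetical micro-benchmark: ElementPath find() vs a dict lookup
# on a parent with many children.
import timeit

from lxml import etree

parent = etree.Element('BASE')
index = {}
for n in range(30000):
    child = etree.SubElement(parent, 'elem', id=str(n))
    index[('elem', str(n))] = child

target = '29999'  # worst case: the last child
find_s = timeit.timeit(lambda: parent.find(f'elem[@id="{target}"]'), number=100)
dict_s = timeit.timeit(lambda: index[('elem', target)], number=100)
print(f"find(): {find_s:.3f}s  dict: {dict_s:.6f}s  (100 lookups each)")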
Asked by Petru Tanas


1 Answer

Caching all identifiers as you go seems to work well and doesn't slow down significantly as more data is added.

The following code does this:

def xml_union(files, loader):
    existing = {}
    path = []

    def populatewalk(elem):
        pid = elem.get('id')
        ident = (elem.tag, pid)
        path.append(ident)
        if pid is not None:
            existing[tuple(path)] = elem
        for child in elem:
            populatewalk(child)
        popped = path.pop()
        assert popped is ident

    def walk(existing_parent, elem):
        pid = elem.get('id')
        
        if pid is None:
            existing_parent.append(elem)
            # make sure children are populated
            return populatewalk(elem)

        ident = (elem.tag, pid)
        path.append(ident)
        tpath = tuple(path)
        existing_elem = existing.get(tpath)
        if existing_elem is None:
            existing_parent.append(elem)
            existing[tpath] = elem
            for child in elem:
                populatewalk(child)
        else:
            existing_elem.attrib.update(elem.items())
            for child in elem:
                walk(existing_elem, child)

        popped = path.pop()
        assert popped is ident        

    first, *remain = files
    root = loader(first)
    for elem in root:
        populatewalk(elem)

    for text in remain:
        ri = loader(text)
        if root.tag != ri.tag:
            raise ValueError(f"root tag {root.tag!r} does not equal {ri.tag!r}")
        for elem in ri:
            walk(root, elem)
    
    return root

The above code assumes that you always want to use an id attribute to identify elements, but that should be easy to change. It is also slightly more general in that it keeps track of the element hierarchy when doing the union, while your code only seems to care that an element with a given ID can be found. Not sure if that matters!
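
For reference, here is a small, hypothetical illustration (not part of the original answer) of the identity scheme used above: each element is effectively keyed by the sequence of (tag, id) pairs along its path, so the same id under different parents stays distinct. The path_key helper and its id_attr parameter below are made up for the example; xml_union builds comparable keys incrementally (and without the root element) as it walks:

# Hypothetical helper showing the path-based identity idea behind xml_union.
from lxml import etree

def path_key(elem, id_attr='id'):
    """Return the tuple of (tag, id) pairs from the root down to elem."""
    parts = []
    node = elem
    while node is not None:
        parts.append((node.tag, node.get(id_attr)))
        node = node.getparent()
    return tuple(reversed(parts))

doc = etree.fromstring(
    '<BASE><elem id="1"><data-tag id="1"><object id="23124"/></data-tag></elem></BASE>')
print(path_key(doc.find('.//object')))
# -> (('BASE', None), ('elem', '1'), ('data-tag', '1'), ('object', '23124'))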

This can be tested with the following line, with f1 and f2 set to strings containing the two example documents you posted above.

print(etree.tostring(xml_union([f1, f2], etree.fromstring)).decode())

Writing this didn't take too long, but convincing myself it's somewhat correct and performant took longer. I ended up writing a test harness that generates 10 files of ~12 MiB each, runs the above code on them, writes the result to an ~87 MiB file, and then makes sure that file is exactly the union of what was generated. The part that uses xml_union looks like:

from time import time

from lxml import etree

def fileloader(path):
    print(f"loading {path}")
    return etree.parse(path).getroot()

t0 = time()
new_root = xml_union(
    [f'large-{i:02}.xml' for i in range(10)],
    fileloader,
)
t1 = time()
with open('merged.xml', 'wb') as fd:
    print("writing merged")
    etree.ElementTree(new_root).write(fd, pretty_print=True)
t2 = time()

print(f"union={t1-t0:.2f} write={t2-t1:.2f}")

My 1.6 GHz laptop takes ~23 seconds to merge these files, with no noticeable slowdown on the later files. Writing the resulting object takes Python ~2 seconds.

The test harness is much more fiddly, and looks like:

from itertools import product
from random import choices

from lxml import etree

# `terminals` is never defined in the original answer; it is assumed here to be
# the list of leaf tag names attached to the generated <object> elements:
terminals = ['POS', 'grammar', 'Aspect', 'Number', 'Tag', 'concept']

def randomizer():
    num = 1
    def next(n, rb):
        nonlocal num
        for _ in range(n):
            yield num, choices(rb, k=len(terminals))
            num += 1
    return next

rootids = list(range(10))
roots = [etree.Element('BASE') for _ in rootids]
obj_elems = {}
dt_elems = {}
el_elems = {}

def get_obj(root_id, el_id, dt_id, obj_id):
    obj = obj_elems.get((root_id, obj_id))
    if obj is not None:
        return obj

    obj = obj_elems[(root_id, obj_id)] = etree.Element('object', id=str(obj_id))
    dt = dt_elems.get((root_id, dt_id))
    if dt is not None:
        dt.append(obj)
        return obj

    dt = dt_elems[(root_id, dt_id)] = etree.Element('data-tag', id=str(dt_id))
    dt.append(obj)
    el = el_elems.get((root_id, el_id))
    if el is not None:
        el.append(dt)
        return obj

    el = el_elems[(root_id, el_id)] = etree.Element('elem', id=str(el_id))
    el.append(dt)
    roots[root_id].append(el)
    return obj

elmaker = randomizer()
dtmaker = randomizer()
objmaker = randomizer()

for el_id, el_roots in elmaker(1000, rootids):
    for dt_id, dt_roots in dtmaker(100, el_roots):
        for obj_id, obj_roots in objmaker(len(terminals), dt_roots):
            for key, root_id in zip(terminals, obj_roots):
                get_obj(root_id, el_id, dt_id, obj_id).append(
                    etree.Element(key, an='val')
                )

for root_id, root in zip(rootids, roots):
    with open(f'large-{root_id:02}.xml', 'wb') as fd:
        et = etree.ElementTree(root)
        et.write(fd, pretty_print=True)

nelem = 1000
ndt = 100
nterm = len(terminals)

expected_elems = set(str(i+1) for i in range(nelem))
expected_dts = set(str(i+1) for i in range(nelem*ndt))
expected_objs = set(str(i+1) for i in range(nelem*ndt*nterm))
expected_terms = set(product(expected_objs, terminals))

elem_seen = set()
dt_seen = set()
obj_seen = set()
terms_seen = set()

def check(el, tag, seen):
    assert el.tag == tag
    aid = el.attrib['id']
    assert aid not in seen
    seen.add(aid)
    return aid

for elem in etree.parse('merged.xml').getroot():
    check(elem, 'elem', elem_seen)
    for dt in elem:
        check(dt, 'data-tag', dt_seen)
        for obj in dt:
            obj_id = check(obj, 'object', obj_seen)
            for term in obj:
                assert term.tag in terminals
                term_id = (obj_id, term.tag)
                assert term_id not in terms_seen
                terms_seen.add(term_id)

assert elem_seen == expected_elems
assert dt_seen == expected_dts
assert obj_seen == expected_objs
assert terms_seen == expected_terms

Hopefully that test harness is useful to somebody else!

Answered by Sam Mason