Performance Issues with Insertion from GraphML to Neo4j

Hi Team,

I'm currently working on processing a large GraphML file (approximately 500 MB) and inserting its data into a Neo4j database. Since GraphML is not directly compatible with Neo4j, I've developed a custom parser to extract nodes and edges, which I then insert using Cypher queries.

However, I’m experiencing significant performance issues during the insertion process. Here are some details:

Performance Metrics:

    • Single Thread: Approximately 3.5 hours
    • Multiple Threads: Approximately 1.5 hours

Example GraphML Snippet:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<graphml xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://www.yworks.com/xml/schema/graphml.html/2.0/ygraphml.xsd " xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:y="http://www.yworks.com/xml/yfiles-common/3.0" xmlns:x="http://www.yworks.com/xml/yfiles-common/markup/3.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  <key id="nl" for="node" attr.name="NodeLabels"/>
  <key id="el" for="edge" attr.name="EdgeLabels"/>
  <key id="lineNumberData" for="edge"/>
  <key id="filenameData" for="edge"/>
  <graph edgedefault="directed">
    <node id="m00000">
      <data key="nl"><x:List><y:Label><y:Label.Text>bcc_sw3_init_bcc_sig3</y:Label.Text></y:Label></x:List></data>
      <graph edgedefault="directed">
      </graph>
    </node>
    <node id="m00001">
      <data key="nl"><x:List><y:Label><y:Label.Text>bcc_sig3</y:Label.Text></y:Label></x:List></data>
      <graph edgedefault="directed">
        <node id="n00000000">
          <data key="nl"><x:List><y:Label><y:Label.Text>BSIG_Number_of_Temperatures</y:Label.Text></y:Label></x:List></data>
        </node>
        <node id="n00000002">
          <data key="nl"><x:List><y:Label><y:Label.Text>BmsIn_TpVal</y:Label.Text></y:Label></x:List></data>
        </node>
        <node id="n00000006">
          <data key="nl"><x:List><y:Label><y:Label.Text>Sbccsig321_Sum2</y:Label.Text></y:Label></x:List></data>
        </node>
        <node id="n00000008">
          <data key="nl"><x:List><y:Label><y:Label.Text>BmsIn_TpCell</y:Label.Text></y:Label></x:List></data>
        </node>
        <node id="n00000011">
          <data key="nl"><x:List><y:Label><y:Label.Text>BmsIn_NrOfMaxBatTp</y:Label.Text></y:Label></x:List></data>
        </node>
	  </graph>
	</node>
    <edge source="n00000000" target="n00000002" >
      <data key="el"><x:List><y:Label><y:Label.Text>bcc_sig3</y:Label.Text></y:Label></x:List></data>
      <data key="lineNumberData">1123:1123</data>
      <data key="filenameData">320_Module/3201_BCC/01_BCC_SIG/Src/Src_BB2x0_H200SKI/bcc_sig3.c</data>
    </edge>
    <edge source="n00000000" target="n00000002" >
      <data key="el"><x:List><y:Label><y:Label.Text>bcc_sig3</y:Label.Text></y:Label></x:List></data>
      <data key="lineNumberData">1123:1123</data>
      <data key="filenameData">320_Module/3201_BCC/01_BCC_SIG/Src/Src_BB2x0_H200SKI/bcc_sig3.c</data>
    </edge>
    <edge source="n00000000" target="n00000006" >
      <data key="el"><x:List><y:Label><y:Label.Text>bcc_sig3</y:Label.Text></y:Label></x:List></data>
      <data key="lineNumberData">1123:1123</data>
      <data key="filenameData">320_Module/3201_BCC/01_BCC_SIG/Src/Src_BB2x0_H200SKI/bcc_sig3.c</data>
    </edge>
    <edge source="n00000000" target="n00000008" >
      <data key="el"><x:List><y:Label><y:Label.Text>bcc_sig3</y:Label.Text></y:Label></x:List></data>
      <data key="lineNumberData">1123:1123</data>
      <data key="filenameData">320_Module/3201_BCC/01_BCC_SIG/Src/Src_BB2x0_H200SKI/bcc_sig3.c</data>
    </edge>
    <edge source="n00000000" target="n00000008" >
      <data key="el"><x:List><y:Label><y:Label.Text>bcc_sig3</y:Label.Text></y:Label></x:List></data>
      <data key="lineNumberData">1123:1123</data>
      <data key="filenameData">320_Module/3201_BCC/01_BCC_SIG/Src/Src_BB2x0_H200SKI/bcc_sig3.c</data>
    </edge>
    <edge source="n00000000" target="n00000011" >
      <data key="el"><x:List><y:Label><y:Label.Text>bcc_sig3</y:Label.Text></y:Label></x:List></data>
      <data key="lineNumberData">1123:1123</data>
      <data key="filenameData">320_Module/3201_BCC/01_BCC_SIG/Src/Src_BB2x0_H200SKI/bcc_sig3.c</data>
    </edge>
    <edge source="n00000000" target="n00000011" >
      <data key="el"><x:List><y:Label><y:Label.Text>bcc_sig3</y:Label.Text></y:Label></x:List></data>
      <data key="lineNumberData">1123:1123</data>
      <data key="filenameData">320_Module/3201_BCC/01_BCC_SIG/Src/Src_BB2x0_H200SKI/bcc_sig3.c</data>
    </edge>
</graphml>
  1. Code:
 public void pushNodeDataToNeo4j(String lable, Map<Node, List<Node>> nodeHierarchy) {
        StringBuilder insertQuery = null;
        for (Map.Entry<Node, List<Node>> entry : nodeHierarchy.entrySet()) {
            try (var session = getNeo4jConnection().session()) {
                Node node = entry.getKey();
                List<Node> childNodes = entry.getValue();
                //Parent Node
                if (node != null) {
                    insertQuery = new StringBuilder().append("MERGE (s1:" + lable + " {name: '" + node.getName() + "'})\n").append("RETURN s1");
                    //Db call
                    session.executeWrite(tx -> {
                        var result = tx.run(insertQuery);
                        return "Success";
                    });
                }
                //Child Nodes
                if (childNodes.size() > 0) {
                    for (Node childNode : childNodes) {
                        insertQuery = new StringBuilder().append("MERGE (s1:" + lable + " {name: '" + childNode.getName() + "'})\n")
                                .append("MERGE (s2:" + lable + " {name: '" + node.getName() + "'})\n")
                                .append("MERGE (s1)-[r1:PARENT]->(s2)")
                                .append("MERGE (s2)-[r2:CHILD]->(s1)")
                                .append("RETURN s1,s2,r1,r2");
                        //Db call
                        session.executeWrite(tx -> {
                            var result = tx.run(insertQuery);
                            return "Success";
                        });
                    }
                }
            }
        }
    }
public void pushEdgeDataToNeo4j(String databaseName, List<Edge> edgeData){
        try (var session = getNeo4jConnection().session()) {
            for(Edge edge:edgeData){
                StringBuilder insertQuery=new StringBuilder();
                insertQuery.append("MATCH (s1:"+ databaseName +" {name: '"+edge.getSource().getName()+"'})\n")
                        .append("MATCH (s2:"+ databaseName+"{name: '"+edge.getTarget().getName()+"'})\n")
                        .append("MERGE (s1)-[r:LINK {source:'"+edge.getSource().getName()+"',target:'"+edge.getTarget().getName()+"',name:'"+edge.getName()+"', lineNumber:'"+edge.getLineNumber()+"', filePath:'"+edge.getFilePath()+"'}]->(s2)\n")
                        .append("RETURN s1,s2");

                session.executeWrite(tx -> {
                    var result = tx.run(insertQuery.toString());
                    return "Success";
                });
            }
        }
    } 

Environment:

  • Neo4j Version: 5.22.0
  • Neo4j Driver: neo4j-java-driver

Questions:

  1. What strategies can I employ to optimize the insertion process?
  2. Are there specific Cypher query optimizations I should consider?

I appreciate any insights or suggestions that could help improve the performance of this data insertion process. Thank you!

Note: To parse graphml using java sax parser - its taking 1-2 minutes to parse

Regards
Jayachand