Newbie and trying to load data: Help!

Hi all,

I'm new to Neo4j and am trying to use it to build a graph database for a university thesis I'm writing on news analysis.

I have a CSV file that has approx. 23,000 rows across 6 columns which I'm trying to load via python into my neo4j database.

Loading this into Neo4j is taking an absolute age: my current run has loaded 590,000 tweets after 12 hours, but I estimate there will be between 1.5 and 2.0 million tweets in total (each CSV row carries a tab-separated list of tweet IDs).
What am I doing wrong that is causing this to take so long?

I've listed the structure of what I'm trying to achieve below:

Graph Model:

News Nodes
(:News {
id: 'politifact-14742',
title: 'Snapchat is shutting down!',
url: 'http://...',
source: 'politifact',
label: 'fake',
tweets_count: 20
})

Tweet Nodes
(:Tweet {
id: '248149691466911744'
})

Relationship
(:Tweet)-[:MENTIONS]->(:News)

Here is my code in python too:

import pandas as pd
from neo4j import GraphDatabase
import math

# Neo4j connection info
# NOTE: the values below are placeholders; replace them with the real
# connection URI and credentials before running the script.
uri = "url"
username = "username"
password = "password"

# Path of the CSV file to import (placeholder).
csv_path = "path"

# Module-level driver shared by the loader functions below.
driver = GraphDatabase.driver(uri, auth=(username, password))

def chunker(seq, size):
    """Split a sequence into consecutive chunks of at most ``size`` items.

    Args:
        seq: A sliceable sequence with a length (e.g. a list).
        size: Maximum number of items per chunk; must be positive.

    Returns:
        A generator yielding slices of ``seq`` in order. The last chunk may
        be shorter than ``size``; an empty ``seq`` yields nothing.

    Raises:
        ValueError: If ``size`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject it up front instead of loading nothing without any error.
    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

def load_news_batch(news_data):
    """Upsert one batch of News nodes in a single write transaction.

    Args:
        news_data: List of dicts, one per News node, providing the keys
            ``id``, ``title``, ``url``, ``source``, ``label`` and
            ``tweets_count`` that the query reads from each row.
    """
    # MERGE looks nodes up by id, so it is only fast when an index or
    # uniqueness constraint exists on :News(id).
    query = """
    UNWIND $batch AS row
    MERGE (n:News {id: row.id})
    SET n.title = row.title,
        n.url = row.url,
        n.source = row.source,
        n.label = row.label,
        n.tweets_count = row.tweets_count
    """

    def _merge_news(tx):
        # Consume the result inside the transaction function: a Result must
        # not be returned from it, because it is tied to the transaction.
        tx.run(query, batch=news_data).consume()

    with driver.session() as session:
        session.execute_write(_merge_news)
    print(f"Loaded {len(news_data)} News nodes.")

def load_tweets_batch(tweet_data):
    """Upsert one batch of Tweet nodes in a single write transaction.

    Args:
        tweet_data: List of dicts, each with an ``id`` key holding the
            tweet ID to merge.
    """
    # MERGE looks nodes up by id, so it is only fast when an index or
    # uniqueness constraint exists on :Tweet(id).
    query = """
    UNWIND $batch AS row
    MERGE (t:Tweet {id: row.id})
    """

    def _merge_tweets(tx):
        # Consume the result inside the transaction function: a Result must
        # not be returned from it, because it is tied to the transaction.
        tx.run(query, batch=tweet_data).consume()

    with driver.session() as session:
        session.execute_write(_merge_tweets)
    print(f"Loaded {len(tweet_data)} Tweet nodes.")

def load_mentions_batch(mention_data):
    """Merge one batch of (:Tweet)-[:MENTIONS]->(:News) relationships.

    Both endpoints are looked up with MATCH, so rows whose Tweet or News
    node does not exist produce no relationship.

    Args:
        mention_data: List of dicts with the keys ``tweet_id`` and
            ``news_id`` identifying the two endpoints.
    """
    query = """
    UNWIND $batch AS row
    MATCH (t:Tweet {id: row.tweet_id})
    MATCH (n:News {id: row.news_id})
    MERGE (t)-[:MENTIONS]->(n)
    """

    def _merge_mentions(tx):
        # Consume the result inside the transaction function: a Result must
        # not be returned from it, because it is tied to the transaction.
        tx.run(query, batch=mention_data).consume()

    with driver.session() as session:
        session.execute_write(_merge_mentions)
    print(f"Created {len(mention_data)} MENTIONS relationships.")

def main(batch_size=5000):
    """Rebuild the graph from the CSV file at ``csv_path``.

    Wipes the database, reads the CSV, builds the News, Tweet and MENTIONS
    payloads in memory, then writes them in batches.

    The loaders look nodes up by ``id`` with MERGE/MATCH. Without an index or
    uniqueness constraint on ``:News(id)`` and ``:Tweet(id)``, each lookup has
    to examine every node with that label, which makes large loads very slow.

    Args:
        batch_size: Number of rows sent to Neo4j per write transaction.
    """

    def _text(value):
        """Return ``value`` as a string, or "" for an empty (NaN) CSV cell."""
        return "" if pd.isna(value) else str(value)

    def _tweet_ids(value):
        """Parse a tab-separated tweet-ID cell into a list of non-empty IDs."""
        # An empty cell is NaN; str(NaN) would otherwise yield a bogus
        # tweet with the ID "nan".
        if pd.isna(value):
            return []
        return [part.strip() for part in str(value).split("\t") if part.strip()]

    # Wipe existing graph (wipe_database is defined outside this excerpt).
    wipe_database()

    # Read CSV
    df = pd.read_csv(csv_path)
    print(f"Loaded CSV with {df.shape[0]} rows.")

    news_batch = []
    tweet_batch = []
    mention_batch = []
    # A tweet can appear under several news items; queue each node only once.
    seen_tweets = set()

    for _, row in df.iterrows():
        news_id = str(row["id"])
        tweet_ids = _tweet_ids(row["tweet_ids"])

        news_batch.append({
            "id": news_id,
            "url": _text(row["news_url"]),
            "title": _text(row["title"]),
            "source": _text(row["source"]),
            "label": _text(row["label"]),
            # Count only real IDs so the value matches the MENTIONS queued.
            "tweets_count": len(tweet_ids),
        })

        for tweet_id in tweet_ids:
            if tweet_id not in seen_tweets:
                seen_tweets.add(tweet_id)
                tweet_batch.append({"id": tweet_id})
            mention_batch.append({
                "tweet_id": tweet_id,
                "news_id": news_id,
            })

    print(f"Prepared {len(news_batch)} News nodes.")
    print(f"Prepared {len(tweet_batch)} Tweet nodes.")
    print(f"Prepared {len(mention_batch)} MENTIONS relationships.")

    # Nodes must exist before relationships: the MENTIONS loader MATCHes
    # both endpoints.
    for batch in chunker(news_batch, batch_size):
        load_news_batch(batch)

    for batch in chunker(tweet_batch, batch_size):
        load_tweets_batch(batch)

    for batch in chunker(mention_batch, batch_size):
        load_mentions_batch(batch)

    print("All data loaded successfully!")

# Run the import only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main(batch_size=5000)

I looked through the forum and found my issue - I needed to create a UNIQUE CONSTRAINT on the ID property.

I updated my code to include the constraints and it loaded in under 10 minutes :smiley:

Here is the update:
def create_constraints():
    """Create uniqueness constraints on ``News.id`` and ``Tweet.id``.

    Each statement uses ``IF NOT EXISTS``, so calling this repeatedly is
    safe. Run it before loading data so that MERGE/MATCH lookups by ``id``
    can use the index backing each constraint.
    """
    statements = (
        """
        CREATE CONSTRAINT news_id_unique IF NOT EXISTS
        FOR (n:News)
        REQUIRE n.id IS UNIQUE
        """,
        """
        CREATE CONSTRAINT tweet_id_unique IF NOT EXISTS
        FOR (t:Tweet)
        REQUIRE t.id IS UNIQUE
        """,
    )

    def _run_statement(tx, statement):
        # Consume the result inside the transaction function: a Result must
        # not be returned from it, because it is tied to the transaction.
        tx.run(statement).consume()

    with driver.session() as session:
        # One transaction per statement, as in the original two calls.
        for statement in statements:
            session.execute_write(_run_statement, statement)
    print("Constraints created.")

Just sense-checking with this forum, though: is this the right approach?

@jdj1907

Your Cypher includes many MERGE statements. MERGE is effectively an update-or-create, and for it to be fast you ideally need an index on the label and property being merged on. Without an index, for each row in the CSV Neo4j would need to examine every node with the given label.

This is described at MERGE - Cypher Manual
and specifically

For performance reasons, creating a schema index on the label
or property is highly recommended when using MERGE. 
See Create, show, and delete indexes for more information.

For what it's worth, a constraint is effectively an INDEX plus some 'restriction', i.e. uniqueness etc.