Hi all,
I'm a newbie to neo4j and trying to use it to create a graph database for a University thesis I am writing on news analysis.
I have a CSV file that has approx. 23,000 rows across 6 columns which I'm trying to load via python into my neo4j database.
Loading this into neo4j is taking an absolute age, my current run has loaded 590,000 tweets after 12 hours but I estimate there will be between 1.5 and 2.0 million tweets.
What am I doing wrong that is causing this to take so long?
I've listed the structure of what I'm trying to acheive below:
Graph Model:
News Nodes
(:News {
id: 'politifact-14742',
title: 'Snapchat is shutting down!',
url: 'http://...',
source: 'politifact',
label: 'fake',
tweets_count: 20
})
Tweet Nodes
(:Tweet {
id: '248149691466911744'
})
Relationship
(:Tweet)-[:MENTIONS]->(:News)
Here is my code in python too:
import pandas as pd
from neo4j import GraphDatabase
import math
Neo4j connection info
uri = "url"
username = "username"
password = "password"
csv_path = "path"
driver = GraphDatabase.driver(uri, auth=(username, password))
def chunker(seq, size):
"""Split list into chunks."""
return (seq[pos:pos + size] for pos in range(0, len(seq), size))
def load_news_batch(news_data):
with driver.session() as session:
session.execute_write(
lambda tx: tx.run(
"""
UNWIND $batch AS row
MERGE (n:News {id: row.id})
SET n.title = row.title,
n.url = row.url,
n.source = row.source,
n.label = row.label,
n.tweets_count = row.tweets_count
""",
batch=news_data
)
)
print(f"Loaded {len(news_data)} News nodes.")
def load_tweets_batch(tweet_data):
with driver.session() as session:
session.execute_write(
lambda tx: tx.run(
"""
UNWIND $batch AS row
MERGE (t:Tweet {id: row.id})
""",
batch=tweet_data
)
)
print(f"Loaded {len(tweet_data)} Tweet nodes.")
def load_mentions_batch(mention_data):
with driver.session() as session:
session.execute_write(
lambda tx: tx.run(
"""
UNWIND $batch AS row
MATCH (t:Tweet {id: row.tweet_id})
MATCH (n:News {id: row.news_id})
MERGE (t)-[:MENTIONS]->(n)
""",
batch=mention_data
)
)
print(f"Created {len(mention_data)} MENTIONS relationships.")
def main(batch_size=5000):
# Wipe existing graph
wipe_database()
# Read CSV
df = pd.read_csv(csv_path)
print(f"Loaded CSV with {df.shape[0]} rows.")
# Prepare News nodes
news_batch = []
tweet_batch = []
mention_batch = []
for idx, row in df.iterrows():
news_id = str(row["id"])
url = str(row["news_url"]) if not pd.isna(row["news_url"]) else ""
title = str(row["title"]) if not pd.isna(row["title"]) else ""
source = str(row["source"]) if not pd.isna(row["source"]) else ""
label = str(row["label"]) if not pd.isna(row["label"]) else ""
tweet_ids_str = str(row["tweet_ids"])
tweet_ids = tweet_ids_str.strip().split("\t") if tweet_ids_str.strip() else []
tweets_count = len(tweet_ids)
news_batch.append({
"id": news_id,
"url": url,
"title": title,
"source": source,
"label": label,
"tweets_count": tweets_count
})
for tweet_id in tweet_ids:
tweet_id = tweet_id.strip()
if tweet_id:
tweet_batch.append({"id": tweet_id})
mention_batch.append({
"tweet_id": tweet_id,
"news_id": news_id
})
print(f"Prepared {len(news_batch)} News nodes.")
print(f"Prepared {len(tweet_batch)} Tweet nodes.")
print(f"Prepared {len(mention_batch)} MENTIONS relationships.")
# Batch load News
for batch in chunker(news_batch, batch_size):
load_news_batch(batch)
# Batch load Tweets
for batch in chunker(tweet_batch, batch_size):
load_tweets_batch(batch)
# Batch load Relationships
for batch in chunker(mention_batch, batch_size):
load_mentions_batch(batch)
print("All data loaded successfully!")
if name == "main":
main(batch_size=5000)