cancel
Showing results for 
Search instead for 
Did you mean: 

Unable to add properties to nodes

Hello, I am having trouble adding properties to nodes. I am following this tutorial: "Create a graph database in Neo4j using Python" by CJ Sullivan (Towards Data Science), but as I am completely new to Neo4j, I do not know how to add properties to nodes. I am working with the arXiv dataset ("arXiv Dataset" on Kaggle) and want to add each paper's creation date as a property on the paper nodes in the graph. Here is my notebook (I am using a Neo4j Blank Sandbox to display the nodes):

pip install neo4j

pip install pandas

import neo4j
from neo4j import GraphDatabase
import pandas as pd
import json
import time
from datetime import datetime

# Path of the metadata file; it is parsed as one JSON document per line.
data = 'arXiv-data/test.json'
metadata = []
# Maximum number of records to load; counted down as records are read.
lines = 149
with open(data, 'r') as f:
    while lines > 0:
        raw_record = f.readline()
        if not raw_record:
            # End of file reached before the record limit.
            break
        metadata.append(json.loads(raw_record))
        lines -= 1

# One DataFrame row per loaded JSON record.
df = pd.DataFrame(metadata)
df.dtypes

def get_author_list(line):
    """Build one display name per parsed author entry.

    Each entry is indexed as ``e[1] + ' ' + e[0]``; presumably the entries
    are ``[last_name, first_name, ...]`` as in arXiv's ``authors_parsed``
    field (verify against the dataset).

    Args:
        line: Iterable of parsed author entries (sequences of strings).

    Returns:
        A list of names, one per entry, in the original order.
    """
    # Strip the result so an entry with an empty first (or last) name, such
    # as a collaboration, does not get a stray leading/trailing space.
    return [(e[1] + ' ' + e[0]).strip() for e in line]
# Per-paper list of author display names.
df['cleaned_authors_list'] = df['authors_parsed'].map(get_author_list)


def _first_version_created(versions):
    # Parse the 'created' stamp of the first listed version. Everything up to
    # the first comma is discarded; the remainder must match the format below
    # (note the leading space).
    stamp = versions[0]['created'].split(',')[1]
    return datetime.strptime(stamp, ' %d %b %Y %H:%M:%S %Z')


# NOTE(review): pandas stores these as Timestamp values - confirm that the
# Neo4j driver version in use accepts them as query parameters.
df['created_date'] = [_first_version_created(versions) for versions in df['versions']]

# NOTE: DataFrame.drop returns a new frame and the result is not assigned, so
# this only displays the reduced frame; df itself keeps every column.
df.drop(
    columns=[
        'submitter', 'authors', 'title', 'journal-ref', 'doi', 'report-no',
        'comments', 'categories', 'license', 'abstract', 'versions',
        'update_date', 'authors_parsed',
    ]
)

class Neo4jConnection:
    """Thin convenience wrapper around a Neo4j driver.

    Failures while creating the driver or running a query are printed
    rather than raised.
    """

    def __init__(self, uri, user, pwd):
        """Remember the connection settings and try to create the driver.

        If driver creation raises, the error is printed and the instance is
        left without a driver.
        """
        self.__uri, self.__user, self.__pwd = uri, user, pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(
                self.__uri, auth=(self.__user, self.__pwd)
            )
        except Exception as e:
            print("Failed to create the driver:", e)

    def close(self):
        """Close the driver, if one was created."""
        if self.__driver is None:
            return
        self.__driver.close()

    def query(self, query, parameters=None, db=None):
        """Run one query in a fresh session and return its records as a list.

        Args:
            query: Query text passed to ``session.run``.
            parameters: Optional parameters passed to ``session.run``.
            db: Optional database name; when given it is passed to
                ``driver.session(database=...)``.

        Returns:
            The list of records, or ``None`` if opening the session or
            running the query raised (the error is printed).

        Raises:
            AssertionError: If no driver was created.
        """
        assert self.__driver is not None, "Driver not initialized!"
        session_options = {} if db is None else {"database": db}

        try:
            session = self.__driver.session(**session_options)
        except Exception as e:
            # No session was opened, so there is nothing to close.
            print("Query failed:", e)
            return None

        try:
            return list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
            return None
        finally:
            session.close()


# Shared connection used by the ingest helpers below.
# NOTE: the address and credentials are hard-coded in the notebook.
conn = Neo4jConnection(
    uri="bolt://44.197.113.107:7687",
    user="neo4j",
    pwd="delight-hardships-mitt",
)

# Uniqueness constraints for paper ids and author names.
# NOTE(review): `ON ... ASSERT` is the older constraint syntax; newer Neo4j
# releases use `FOR ... REQUIRE` - confirm against the server version. A
# statement the server rejects is only printed by conn.query, not raised.
for constraint_statement in (
    'CREATE CONSTRAINT papers IF NOT EXISTS ON (p:Paper) ASSERT p.id IS UNIQUE',
    'CREATE CONSTRAINT authors IF NOT EXISTS ON (a:Author) ASSERT a.name IS UNIQUE',
):
    conn.query(constraint_statement)

def add_authors(rows, batch_size=10000):
    """Merge one ``Author`` node per row into the graph, in batches.

    Args:
        rows: Table whose records expose an ``author`` field (read by the
            query as ``row.author``).
        batch_size: Number of rows sent per query.

    Returns:
        Whatever ``insert_data`` returns for this query.
    """
    merge_authors_cypher = '''
            UNWIND $rows AS row
            MERGE (:Author {name: row.author})
            RETURN count(*) as total
            '''
    return insert_data(merge_authors_cypher, rows, batch_size)


def insert_data(query, rows, batch_size = 10000, connection=None):
    """Run ``query`` once per batch of ``rows`` and report progress.

    Each batch is sent as the ``$rows`` query parameter (a list of record
    dicts). The query must return a single record with a ``total`` field.

    Args:
        query: Cypher text that consumes ``$rows`` and returns ``total``.
        rows: pandas DataFrame to upload.
        batch_size: Number of rows per query; must be at least 1.
        connection: Object with a ``query(query, parameters=...)`` method.
            Defaults to the module-level ``conn``.

    Returns:
        A dict with the running ``total``, the number of ``batches`` and the
        elapsed ``time`` in seconds, or ``None`` when ``rows`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If a batch query returns no records, i.e. ``None`` or
            an empty list.
    """
    if batch_size < 1:
        # A non-positive batch size would never advance through the rows.
        raise ValueError("batch_size must be at least 1, got %r" % (batch_size,))

    # The global is only looked up when no explicit connection is given.
    active_conn = conn if connection is None else connection

    total = 0
    batch = 0
    start = time.time()
    result = None

    for offset in range(0, len(rows), batch_size):
        records = rows[offset:offset + batch_size].to_dict('records')
        res = active_conn.query(query, parameters={'rows': records})
        if not res:
            # Fail with an explicit message instead of the opaque
            # "'NoneType' object is not subscriptable" on res[0].
            raise RuntimeError(
                "Batch %d (rows %d-%d) returned no result; the query most "
                "likely failed - check the error reported by the connection."
                % (batch, offset, offset + len(records) - 1)
            )
        total += res[0]['total']
        batch += 1
        result = {"total":total,
                  "batches":batch,
                  "time":time.time()-start}
        print(result)

    return result

def add_papers(rows, batch_size=50):
    """Merge ``Paper`` nodes and their ``AUTHORED`` relationships, in batches.

    For every row the query merges a ``Paper`` by ``id`` and links it to each
    already-existing ``Author`` named in ``cleaned_authors_list``. Because the
    properties are written with ``ON CREATE SET``, ``title`` and ``date`` are
    only set when the merge creates the node, not when it already exists.

    Args:
        rows: Table whose records expose ``id``, ``title``, ``created_date``
            and ``cleaned_authors_list``.
        batch_size: Number of rows sent per query.

    Returns:
        Whatever ``insert_data`` returns for this query.
    """
    merge_papers_cypher = '''
   UNWIND $rows as row
   MERGE (p:Paper {id:row.id}) ON CREATE SET p.title = row.title, p.date = row.created_date
 
   // connect authors
   WITH distinct row, p // reduce cardinality
   UNWIND row.cleaned_authors_list AS author
   MATCH (a:Author {name: author})
   MERGE (a)-[:AUTHORED]->(p)
   RETURN count(distinct p) as total
   '''

    return insert_data(merge_papers_cypher, rows, batch_size)

# One row per distinct author name, in a single column called 'author'.
authors = (
    pd.DataFrame(df[['cleaned_authors_list']])
    .rename(columns={'cleaned_authors_list': 'author'})
    .explode('author')
    .drop_duplicates(subset=['author'])
)

# Authors are loaded first because add_papers only MATCHes existing Author
# nodes when creating the AUTHORED relationships.
add_authors(authors)
add_papers(df)

After trying to run the notebook, I get the following error message:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Input In [10], in <module>
      4 authors=authors.explode('author').drop_duplicates(subset=['author'])
      6 add_authors(authors)
----> 7 add_papers(df)

Input In [9], in add_papers(rows, batch_size)
      1 def add_papers(rows, batch_size=50):
      2    # Adds paper nodes and (:Author)--(:Paper) relationships to the Neo4j graph as a 
      3    # batch job.
      5    query = '''
      6    UNWIND $rows as row
      7    MERGE (p:Paper {id:row.id}) ON CREATE 
   (...)
     16    RETURN count(distinct p) as total
     17    '''
---> 19    return insert_data(query, rows, batch_size)

Input In [8], in insert_data(query, rows, batch_size)
     19 while batch * batch_size < len(rows):
     21     res = conn.query(query, 
     22                      parameters = {'rows': rows[batch*batch_size:(batch+1)*batch_size].to_dict('records')})
---> 23     total += res[0]['total']
     24     batch += 1
     25     result = {"total":total, 
     26               "batches":batch, 
     27               "time":time.time()-start}

TypeError: 'NoneType' object is not subscriptable

Any help would be highly appreciated!

1 REPLY 1

anthapu
Graph Fellow

You can use a Python ingest utility such as pyingest (neo4j-field/pyingest on github.com).

This will reduce the need to write custom code to ingest the data.