Error by invoking gds.beta.pipeline.linkPrediction.train procedure. Node properties do not exist in the graph. But they do!

My graph:

CALL gds.graph.project(
    'myGraph1',
    {
        Project: {
            label: 'Project',
            properties: [
                'projectPlannedRevenue', 
                'projectRevenueRatio', 
                'projectOverallSlippagePercent', 
                'projectCostRatio', 
                'projectTeamSize', 
                'projectDuration', 
                'totalRiskValue', 
                'avgRiskValue', 
                'riskCount', 
                'revenueToCostRatio',
                'embedding'
            ]
        },
        Risk: {
            label: 'Risk',
            properties: [
                'riskValue',  
                'riskDuration',
                'embedding'
            ]
        }
    },
    {
        HAS_RISK: {
            type: 'HAS_RISK',
            orientation: 'UNDIRECTED'
        }
    }
);

my train query:

CALL gds.beta.pipeline.linkPrediction.train('myGraph1', {
  pipeline: 'pipe',
  modelName: 'lp-pipeline-model',
  metrics: ['AUCPR', 'OUT_OF_BAG_ERROR'],
  targetRelationshipType: 'HAS_RISK',
  randomSeed: 12
}) YIELD modelInfo, modelSelectionStats
RETURN
  modelInfo.bestParameters AS winningModel,
  modelInfo.metrics.AUCPR.train.avg AS avgTrainScore,
  modelInfo.metrics.AUCPR.outerTrain AS outerTrainScore,
  modelInfo.metrics.AUCPR.test AS testScore,
  [cand IN modelSelectionStats.modelCandidates | cand.metrics.AUCPR.validation.avg] AS validationScores

Hello @alfeeva.anastasia ,
Could you share your whole pipeline configuration and your projected graph schema (`gds.graph.list('myGraph1')?

sure, here you go :) @florentin_dorre

CALL gds.beta.pipeline.linkPrediction.create('pipe')

CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'fastRP', {
  mutateProperty: 'embedding',
  embeddingDimension: 256,
  randomSeed: 42
})

 CALL gds.beta.pipeline.linkPrediction.addFeature('pipe', 'hadamard', {
  nodeProperties: ['embedding', 'projectPlannedRevenue', 'projectRevenueRatio', 'projectOverallSlippagePercent',
        'projectCostRatio', 'projectTeamSize', 'projectDuration',
        'totalRiskValue', 'avgRiskValue', 'riskCount', 'revenueToCostRatio', 'riskValue',  'riskDuration']
}) YIELD featureSteps 

CALL gds.beta.pipeline.linkPrediction.configureSplit('pipe', {
  testFraction: 0.25,
  trainFraction: 0.6,
  validationFolds: 3
})
YIELD splitConfig

CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe')
YIELD parameterSpace

CALL gds.beta.pipeline.linkPrediction.addRandomForest('pipe', {numberOfDecisionTrees: 10})
YIELD parameterSpace

CALL gds.alpha.pipeline.linkPrediction.configureAutoTuning('pipe', {
  maxTrials: 2
}) YIELD autoTuningConfig

//(7) Project another graph
CALL gds.graph.project(
    'myGraph1',
    {
        Project: {
            label: 'Project',
            properties: [
                'projectPlannedRevenue', 
                'projectRevenueRatio', 
                'projectOverallSlippagePercent', 
                'projectCostRatio', 
                'projectTeamSize', 
                'projectDuration', 
                'totalRiskValue', 
                'avgRiskValue', 
                'riskCount', 
                'revenueToCostRatio',
                'embedding'
            ]
        },
        Risk: {
            label: 'Risk',
            properties: [
                'riskValue',  
                'riskDuration',
                'embedding'
            ]
        }
    },
    {
        HAS_RISK: {
            type: 'HAS_RISK',
            orientation: 'UNDIRECTED'
        }
    }
);

CALL gds.beta.pipeline.linkPrediction.train('myGraph1', {
  pipeline: 'pipe',
  modelName: 'lp-pipeline-model',
  metrics: ['AUCPR', 'OUT_OF_BAG_ERROR'],
  targetRelationshipType: 'HAS_RISK',
  randomSeed: 12
}) YIELD modelInfo, modelSelectionStats
RETURN
  modelInfo.bestParameters AS winningModel,
  modelInfo.metrics.AUCPR.train.avg AS avgTrainScore,
  modelInfo.metrics.AUCPR.outerTrain AS outerTrainScore,
  modelInfo.metrics.AUCPR.test AS testScore,
  [cand IN modelSelectionStats.modelCandidates | cand.metrics.AUCPR.validation.avg] AS validationScores

and the graph shema @florentin_dorre :

degreeDistribution,graphName,database,databaseLocation,memoryUsage,sizeInBytes,nodeCount,relationshipCount,configuration,density,creationTime,modificationTime,schema,schemaWithOrientation
"{min: 1, max: 97, p90: 2, p999: 97, p99: 8, p50: 1, p75: 1, p95: 4, mean: 1.640625}","""myGraph1""","""neo4j""","""local""","""721 KiB""",739066,512,840,"{relationshipProjection: {HAS_RISK: {aggregation: ""DEFAULT"", orientation: ""UNDIRECTED"", indexInverse: false, properties: {}, type: ""HAS_RISK""}}, readConcurrency: 4, relationshipProperties: {}, nodeProperties: {}, jobId: ""0dfdacc4-e3b6-4b32-b4d8-c4fbfd19438e"", nodeProjection: {Risk: {label: ""Risk"", properties: {riskDuration: {property: ""riskDuration"", defaultValue: null}, riskValue: {property: ""riskValue"", defaultValue: null}, embedding: {property: ""embedding"", defaultValue: null}}}, Project: {label: ""Project"", properties: {projectDuration: {property: ""projectDuration"", defaultValue: null}, projectOverallSlippagePercent: {property: ""projectOverallSlippagePercent"", defaultValue: null}, totalRiskValue: {property: ""totalRiskValue"", defaultValue: null}, projectCostRatio: {property: ""projectCostRatio"", defaultValue: null}, avgRiskValue: {property: ""avgRiskValue"", defaultValue: null}, projectPlannedRevenue: {property: ""projectPlannedRevenue"", defaultValue: null}, revenueToCostRatio: {property: ""revenueToCostRatio"", defaultValue: null}, projectTeamSize: {property: ""projectTeamSize"", defaultValue: null}, embedding: {property: ""embedding"", defaultValue: null}, projectRevenueRatio: {property: ""projectRevenueRatio"", defaultValue: null}, riskCount: {property: ""riskCount"", defaultValue: null}}}}, logProgress: true, creationTime: ""2024-05-14T22:16:54.144203000Z"", validateRelationships: false, sudo: false}",0.003210616438356164,"""2024-05-14T22:16:54.144203000Z""","""2024-05-14T22:16:54.152373000Z""","{graphProperties: {}, nodes: {Risk: {riskDuration: ""Integer (DefaultValue(-9223372036854775808), PERSISTENT)"", riskValue: ""Float (DefaultValue(NaN), PERSISTENT)"", embedding: ""List of Float (DefaultValue(null), PERSISTENT)""}, Project: {projectDuration: ""Integer (DefaultValue(-9223372036854775808), PERSISTENT)"", projectOverallSlippagePercent: ""Float (DefaultValue(NaN), PERSISTENT)"", totalRiskValue: ""Float (DefaultValue(NaN), PERSISTENT)"", projectCostRatio: ""Float (DefaultValue(NaN), PERSISTENT)"", avgRiskValue: ""Float (DefaultValue(NaN), PERSISTENT)"", projectPlannedRevenue: ""Float (DefaultValue(NaN), PERSISTENT)"", revenueToCostRatio: ""Float (DefaultValue(NaN), PERSISTENT)"", projectTeamSize: ""Float (DefaultValue(NaN), PERSISTENT)"", embedding: ""List of Float (DefaultValue(null), PERSISTENT)"", projectRevenueRatio: ""Float (DefaultValue(NaN), PERSISTENT)"", riskCount: ""Integer (DefaultValue(-9223372036854775808), PERSISTENT)""}}, relationships: {HAS_RISK: {}}}","{graphProperties: {}, nodes: {Risk: {riskDuration: ""Integer (DefaultValue(-9223372036854775808), PERSISTENT)"", riskValue: ""Float (DefaultValue(NaN), PERSISTENT)"", embedding: ""List of Float (DefaultValue(null), PERSISTENT)""}, Project: {projectDuration: ""Integer (DefaultValue(-9223372036854775808), PERSISTENT)"", projectOverallSlippagePercent: ""Float (DefaultValue(NaN), PERSISTENT)"", totalRiskValue: ""Float (DefaultValue(NaN), PERSISTENT)"", projectCostRatio: ""Float (DefaultValue(NaN), PERSISTENT)"", avgRiskValue: ""Float (DefaultValue(NaN), PERSISTENT)"", projectPlannedRevenue: ""Float (DefaultValue(NaN), PERSISTENT)"", revenueToCostRatio: ""Float (DefaultValue(NaN), PERSISTENT)"", projectTeamSize: ""Float (DefaultValue(NaN), PERSISTENT)"", embedding: ""List of Float (DefaultValue(null), PERSISTENT)"", projectRevenueRatio: ""Float (DefaultValue(NaN), PERSISTENT)"", riskCount: ""Integer (DefaultValue(-9223372036854775808), PERSISTENT)""}}, relationships: {HAS_RISK: {direction: ""UNDIRECTED"", properties: {}}}}"

@alfeeva.anastasia thank you for your queries.

I see the problem, that you define the node properties per label (Project and Risk) here but call train an all node labels.

Your defined features added in gds.beta.pipeline.linkPrediction.addFeature need to exist for all nodes.

Example: riskValue is only defined for Risk but not for Project. This should lead to the seen error message. (the wording can be improved)

For the different properties per label you could try different things.

  • As a baseline only use the embedding as a common property
  • Specify default values for the properties on the other label during the projection (such as {riskDuration: {defaultValue: 0.0}} for Project nodes)
  • Try a embedding algorithm which supports heterogeneous nodes such as GraphSage.

I would further recommend to specify sourceNodeLabel and targetNodeLabel in your train call, as you want to find Project -> Risk relationships. Otherwise Project -> Project links would also be recommended.

1 Like

Hi @florentin_dorre

Thank you very much for taking time to analyse my script and for your help.

A question regarding the embedding within the GraphSAGE:

In the example described here GraphSAGE - Neo4j Graph Data Science the KNOWS relationship has a weight:

(dan)-[:KNOWS {relWeight: 1.0}]->(annie),
(dan)-[:KNOWS {relWeight: 1.6}]->(matt),
(annie)-[:KNOWS {relWeight: 0.1}]->(matt),

I don`t have that weight in my HAS_RISK relationship between PROJECT and RISK nodes. Whould it still work? Or wouls it make sense to do the embedding with the Node2Vec embedding algorithm, where the weight is not required?

Hi @alfeeva.anastasia,

Relationship weight is completely optional (see relationshipWeightProperty in this table), and it definitely make sense to try GraphSAGE without weights.

But it could certainly also be interesting to try the other node embeddings algorithms :slight_smile:

Good luck,
Adam

1 Like

Thank you Adam! @adam_schill-col @florentin_dorre

Now I have another problem with my project. Somehow when I stream my pipeline, I am getting all same probabilities. Do you have an idea why and what can I do with that?

projectNumber riskDescription probability
32PO-41309 zusatzaufwand ib 0.4999999999999998
32PO-41309 test 0.4999999999999998
32PO-41309 100000 0.4999999999999998
P.01665 knm nacht und wochenendeinsatz 0.4999999999999998
P.01665 mgliches risiko 0.4999999999999998
P.01665 rc projekleiterwechsel 0.4999999999999998
32PO-41309 material missing 0.4999999999999998
P.01665 steigende materialpreise fr rohstoffe 0.4999999999999998
...

my script:

//.1. Load CSV and create all nodes and relationships
LOAD CSV WITH HEADERS FROM 'file:///LabeledFilteredOutput.csv' AS row

// Create Project Nodes
MERGE (p:Project {projectNumber: row.`Project Number`})
ON CREATE SET p.projectNumber = row.`Project Number`,
              p.projectActualRevenue = coalesce(toFloat(row.`Project Actual Revenue`), 0.0),
              p.projectPlannedRevenue = coalesce(toFloat(row.`Project Planned Revenue`), 0.0),
              p.projectRevenueRatio = coalesce(toFloat(row.`(%) Actual Project Revenue / Planned Project Revenue`), 0.0),
              p.projectBusinessSegment = row.`Business Segment of a Project`,
              p.projectCustomerRef = row.`Project Customer Ref`,
              p.projectOverallSlippage = coalesce(toFloat(row.`Project Overall Slippage`), 0.0),
              p.projectOverallSlippagePercent = coalesce(toFloat(row.`% Project Overall Slippage`), 0.0),
              p.projectCostRatio = coalesce(toFloat(row.`(%) Actual Project Cost / Planned Project Cost`), 0.0),
              p.projectTeamSize = coalesce(toFloat(row.`Project Team Size`), 0.0),
              p.projectFinishDate = date(row.`Project Finish Date`),
              p.projectCreatedOn = date(row.`Project Created On`),
              p.projectModifiedOn = date(row.`Project Modified On`),
              p.projectExecutionSlippage = coalesce(toFloat(row.`Project Execution Slippage`), 0.0),
              p.projectExecutionSlippagePercent = coalesce(toFloat(row.`% Project Execution Slippage`), 0.0),
              p.projectWarrantySlippage = coalesce(toFloat(row.`Project Warranty Slippage`), 0.0),
              p.projectWarrantySlippagePercent = coalesce(toFloat(row.`% Project Warranty Slippage`), 0.0)

// Create and link Risk Nodes with Risk Type, Status, and Label
MERGE (r:Risk {Risk: row.Risk})
ON CREATE SET r.riskDescription = row.`Risk Description`,
              r.riskType = row.`Risk Type`,
              r.riskCreatedOn = date(row.`Risk Created On`),
              r.riskModifiedOn = date(row.`Risk Modified On`),
              r.riskInitialValue = coalesce(toFloat(row.`Risk Initial Value`), 0.0),
              r.riskValue = coalesce(toFloat(row.`Risk Value`), 0.0),
              r.riskPcaId = row.`Risk PCA ID`,
              r.riskLabel = row.`Label`,
              r.riskDIStatus = row.`Risk DI Status`,
              r.riskStatus = row.`Risk Status`

// Link Risk to RiskLabel if label is not null
//FOREACH (ignoreMe IN CASE WHEN row.`Label` IS NOT NULL THEN [1] ELSE [] END |
//    FOREACH (label IN split(row.`Label`, ';') | // Assuming labels are separated by ';'
//        MERGE (l:RiskLabel {name: label})
//        MERGE (r)-[:HAS_LABEL]->(l)
//    )
//)

MERGE (p)-[:HAS_RISK]->(r);

//.2. Feature engineering
// Temporal Features: Calculate risk duration, project duration, and time to project finish
MATCH (p:Project)-[:HAS_RISK]->(r:Risk)
SET r.riskDuration = coalesce(duration.inDays(date(r.riskCreatedOn), date(r.riskModifiedOn)).days, 0),
    p.projectDuration = coalesce(duration.inDays(date(p.projectCreatedOn), date(p.projectFinishDate)).days, 0),
    r.timeToProjectFinish = coalesce(duration.inDays(date(r.riskCreatedOn), date(p.projectFinishDate)).days, 0);

// Risk Value Change: Calculate the change in risk value from initial to current
MATCH (r:Risk)
SET r.riskValueChange = coalesce(r.riskValue - r.riskInitialValue, 0.0);

// Aggregated Risk Metrics: Calculate total, average, and count of risk values per project
MATCH (p:Project)-[:HAS_RISK]->(r:Risk)
WITH p, SUM(coalesce(r.riskValue, 0.0)) AS totalRiskValue, AVG(coalesce(r.riskValue, 0.0)) AS avgRiskValue, COUNT(r) AS riskCount
SET p.totalRiskValue = coalesce(totalRiskValue, 0.0),
    p.avgRiskValue = coalesce(avgRiskValue, 0.0),
    p.riskCount = coalesce(riskCount, 0);

// Revenue and Cost Ratios: Calculate the ratio of actual revenue to planned costs
MATCH (p:Project)
SET p.revenueToCostRatio = coalesce(p.projectActualRevenue / p.projectPlannedRevenue, 0.0);

//.3. Project myGraph
CALL gds.graph.project(
    'myGraph',
    {
        Project: {
            label: 'Project',
            properties: [
                {riskDuration: {defaultValue: 0.0}},
                {projectDuration: {defaultValue: 0.0}},
                {projectOverallSlippagePercent: {defaultValue: 0.0}},
                {projectCostRatio: {defaultValue: 0.0}},
                {avgRiskValue: {defaultValue: 0.0}},
                {projectPlannedRevenue: {defaultValue: 0.0}},
                {projectTeamSize: {defaultValue: 0.0}},
                {riskValue: {defaultValue: 0.0}},
                {projectRevenueRatio: {defaultValue: 0.0}},
                {riskCount: {defaultValue: 0.0}},
                {revenueToCostRatio: {defaultValue: 0.0}}
            ]
        },
        Risk: {
            label: 'Risk',
            properties: [
                {riskDuration: {defaultValue: 0.0}},
                {projectDuration: {defaultValue: 0.0}},
                {projectOverallSlippagePercent: {defaultValue: 0.0}},
                {totalRiskValue: {defaultValue: 0.0}},
                {projectCostRatio: {defaultValue: 0.0}},
                {avgRiskValue: {defaultValue: 0.0}},
                {projectPlannedRevenue: {defaultValue: 0.0}},
                {revenueToCostRatio: {defaultValue: 0.0}},
                {projectTeamSize: {defaultValue: 0.0}},
                {riskValue: {defaultValue: 0.0}},
                {projectRevenueRatio: {defaultValue: 0.0}},
                {riskCount: {defaultValue: 0.0}}
            ]
        }
    },
    {
        HAS_RISK: {
            type: 'HAS_RISK',
            orientation: 'UNDIRECTED'
        }
    }
);

//.5. Stream GraphSage
CALL gds.beta.graphSage.stream(
    'myGraph',
    {
        modelName: 'SageModel'
    }
) YIELD nodeId, embedding
RETURN nodeId, embedding
LIMIT 10;

//.6.Mutate GraphSage
CALL gds.beta.graphSage.mutate(
  'myGraph',
  {
    mutateProperty: 'inMemoryEmbedding',
    modelName: 'SageModel'
  }
) YIELD
  nodeCount,
  nodePropertiesWritten

//.7. create a pipeline
CALL gds.beta.pipeline.linkPrediction.create('pipe')

//.8.Pipeline addNodeProperty
CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'fastRP', {
  mutateProperty: 'embedding',
  embeddingDimension: 256,
  randomSeed: 42
})

//.9. Pipeline addFeature
CALL gds.beta.pipeline.linkPrediction.addFeature('pipe', 'hadamard', {
  nodeProperties: ['embedding']
}) YIELD featureSteps 

// Add more feature generation methods
CALL gds.beta.pipeline.linkPrediction.addFeature('pipe', 'l2', {
  nodeProperties: ['embedding']
}) YIELD featureSteps 

// Add cosine feature with nodeProperties configuration
CALL gds.beta.pipeline.linkPrediction.addFeature('pipe', 'cosine', {
  nodeProperties: ['embedding']
}) YIELD featureSteps 

//.10. Pipeline ConfigureSplit
CALL gds.beta.pipeline.linkPrediction.configureSplit('pipe', {
  testFraction: 0.25,
  trainFraction: 0.6,
  validationFolds: 3
})
YIELD splitConfig

//.11. Pipeline addLogisticRegression
CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe')
YIELD parameterSpace

//.12. Pipeline addRandomForest
CALL gds.beta.pipeline.linkPrediction.addRandomForest('pipe', {
  numberOfDecisionTrees: 50,
  maxDepth: 20,
  minSplitSize: 10,
  minLeafSize: 5
})
YIELD parameterSpace

//.13. Pipeline configureAutoTuning
CALL gds.alpha.pipeline.linkPrediction.configureAutoTuning('pipe', {
  maxTrials: 10  // Increased number of trials
}) YIELD autoTuningConfig

//.14. Pipeline train linkPrediction
CALL gds.beta.pipeline.linkPrediction.train('myGraph', {
  pipeline: 'pipe',
  modelName: 'pipe-model',
  metrics: ['AUCPR', 'OUT_OF_BAG_ERROR'],
  targetRelationshipType: 'HAS_RISK',
  randomSeed: 12
}) YIELD modelInfo, modelSelectionStats
RETURN
  modelInfo.bestParameters AS winningModel,
  modelInfo.metrics.AUCPR.train.avg AS avgTrainScore,
  modelInfo.metrics.AUCPR.outerTrain AS outerTrainScore,
  modelInfo.metrics.AUCPR.test AS testScore,
  [cand IN modelSelectionStats.modelCandidates | cand.metrics.AUCPR.validation.avg] AS validationScores;

//.15. Pipeline stream 
CALL gds.beta.pipeline.linkPrediction.predict.stream('myGraph', {
  modelName: 'pipe-model',
  topN: 100,
  threshold: 0.3
})
YIELD node1, node2, probability
MATCH (p1:Project), (p2:Risk)
WHERE id(p1) = node1 AND id(p2) = node2
RETURN p1.projectNumber AS projectNumber, p2.riskDescription AS riskDescription, probability
ORDER BY probability DESC
LIMIT 25