Thank you Adam! @adam_schill-col @florentin_dorre
Now I have another problem with my project. Somehow when I stream my pipeline, I am getting all same probabilities. Do you have an idea why and what can I do with that?
projectNumber |
riskDescription |
probability |
32PO-41309 |
zusatzaufwand ib |
0.4999999999999998 |
32PO-41309 |
test |
0.4999999999999998 |
32PO-41309 |
100000 |
0.4999999999999998 |
P.01665 |
knm nacht und wochenendeinsatz |
0.4999999999999998 |
P.01665 |
mgliches risiko |
0.4999999999999998 |
P.01665 |
rc projekleiterwechsel |
0.4999999999999998 |
32PO-41309 |
material missing |
0.4999999999999998 |
P.01665 |
steigende materialpreise fr rohstoffe |
0.4999999999999998 |
... |
|
|
my script:
//.1. Load CSV and create all nodes and relationships
LOAD CSV WITH HEADERS FROM 'file:///LabeledFilteredOutput.csv' AS row
// Create Project Nodes
MERGE (p:Project {projectNumber: row.`Project Number`})
ON CREATE SET p.projectNumber = row.`Project Number`,
p.projectActualRevenue = coalesce(toFloat(row.`Project Actual Revenue`), 0.0),
p.projectPlannedRevenue = coalesce(toFloat(row.`Project Planned Revenue`), 0.0),
p.projectRevenueRatio = coalesce(toFloat(row.`(%) Actual Project Revenue / Planned Project Revenue`), 0.0),
p.projectBusinessSegment = row.`Business Segment of a Project`,
p.projectCustomerRef = row.`Project Customer Ref`,
p.projectOverallSlippage = coalesce(toFloat(row.`Project Overall Slippage`), 0.0),
p.projectOverallSlippagePercent = coalesce(toFloat(row.`% Project Overall Slippage`), 0.0),
p.projectCostRatio = coalesce(toFloat(row.`(%) Actual Project Cost / Planned Project Cost`), 0.0),
p.projectTeamSize = coalesce(toFloat(row.`Project Team Size`), 0.0),
p.projectFinishDate = date(row.`Project Finish Date`),
p.projectCreatedOn = date(row.`Project Created On`),
p.projectModifiedOn = date(row.`Project Modified On`),
p.projectExecutionSlippage = coalesce(toFloat(row.`Project Execution Slippage`), 0.0),
p.projectExecutionSlippagePercent = coalesce(toFloat(row.`% Project Execution Slippage`), 0.0),
p.projectWarrantySlippage = coalesce(toFloat(row.`Project Warranty Slippage`), 0.0),
p.projectWarrantySlippagePercent = coalesce(toFloat(row.`% Project Warranty Slippage`), 0.0)
// Create and link Risk Nodes with Risk Type, Status, and Label
MERGE (r:Risk {Risk: row.Risk})
ON CREATE SET r.riskDescription = row.`Risk Description`,
r.riskType = row.`Risk Type`,
r.riskCreatedOn = date(row.`Risk Created On`),
r.riskModifiedOn = date(row.`Risk Modified On`),
r.riskInitialValue = coalesce(toFloat(row.`Risk Initial Value`), 0.0),
r.riskValue = coalesce(toFloat(row.`Risk Value`), 0.0),
r.riskPcaId = row.`Risk PCA ID`,
r.riskLabel = row.`Label`,
r.riskDIStatus = row.`Risk DI Status`,
r.riskStatus = row.`Risk Status`
// Link Risk to RiskLabel if label is not null
//FOREACH (ignoreMe IN CASE WHEN row.`Label` IS NOT NULL THEN [1] ELSE [] END |
// FOREACH (label IN split(row.`Label`, ';') | // Assuming labels are separated by ';'
// MERGE (l:RiskLabel {name: label})
// MERGE (r)-[:HAS_LABEL]->(l)
// )
//)
MERGE (p)-[:HAS_RISK]->(r);
//.2. Feature engineering
// Temporal Features: Calculate risk duration, project duration, and time to project finish
MATCH (p:Project)-[:HAS_RISK]->(r:Risk)
SET r.riskDuration = coalesce(duration.inDays(date(r.riskCreatedOn), date(r.riskModifiedOn)).days, 0),
p.projectDuration = coalesce(duration.inDays(date(p.projectCreatedOn), date(p.projectFinishDate)).days, 0),
r.timeToProjectFinish = coalesce(duration.inDays(date(r.riskCreatedOn), date(p.projectFinishDate)).days, 0);
// Risk Value Change: Calculate the change in risk value from initial to current
MATCH (r:Risk)
SET r.riskValueChange = coalesce(r.riskValue - r.riskInitialValue, 0.0);
// Aggregated Risk Metrics: Calculate total, average, and count of risk values per project
MATCH (p:Project)-[:HAS_RISK]->(r:Risk)
WITH p, SUM(coalesce(r.riskValue, 0.0)) AS totalRiskValue, AVG(coalesce(r.riskValue, 0.0)) AS avgRiskValue, COUNT(r) AS riskCount
SET p.totalRiskValue = coalesce(totalRiskValue, 0.0),
p.avgRiskValue = coalesce(avgRiskValue, 0.0),
p.riskCount = coalesce(riskCount, 0);
// Revenue and Cost Ratios: Calculate the ratio of actual revenue to planned costs
MATCH (p:Project)
SET p.revenueToCostRatio = coalesce(p.projectActualRevenue / p.projectPlannedRevenue, 0.0);
//.3. Project myGraph
CALL gds.graph.project(
'myGraph',
{
Project: {
label: 'Project',
properties: [
{riskDuration: {defaultValue: 0.0}},
{projectDuration: {defaultValue: 0.0}},
{projectOverallSlippagePercent: {defaultValue: 0.0}},
{projectCostRatio: {defaultValue: 0.0}},
{avgRiskValue: {defaultValue: 0.0}},
{projectPlannedRevenue: {defaultValue: 0.0}},
{projectTeamSize: {defaultValue: 0.0}},
{riskValue: {defaultValue: 0.0}},
{projectRevenueRatio: {defaultValue: 0.0}},
{riskCount: {defaultValue: 0.0}},
{revenueToCostRatio: {defaultValue: 0.0}}
]
},
Risk: {
label: 'Risk',
properties: [
{riskDuration: {defaultValue: 0.0}},
{projectDuration: {defaultValue: 0.0}},
{projectOverallSlippagePercent: {defaultValue: 0.0}},
{totalRiskValue: {defaultValue: 0.0}},
{projectCostRatio: {defaultValue: 0.0}},
{avgRiskValue: {defaultValue: 0.0}},
{projectPlannedRevenue: {defaultValue: 0.0}},
{revenueToCostRatio: {defaultValue: 0.0}},
{projectTeamSize: {defaultValue: 0.0}},
{riskValue: {defaultValue: 0.0}},
{projectRevenueRatio: {defaultValue: 0.0}},
{riskCount: {defaultValue: 0.0}}
]
}
},
{
HAS_RISK: {
type: 'HAS_RISK',
orientation: 'UNDIRECTED'
}
}
);
//.5. Stream GraphSage
CALL gds.beta.graphSage.stream(
'myGraph',
{
modelName: 'SageModel'
}
) YIELD nodeId, embedding
RETURN nodeId, embedding
LIMIT 10;
//.6.Mutate GraphSage
CALL gds.beta.graphSage.mutate(
'myGraph',
{
mutateProperty: 'inMemoryEmbedding',
modelName: 'SageModel'
}
) YIELD
nodeCount,
nodePropertiesWritten
//.7. create a pipeline
CALL gds.beta.pipeline.linkPrediction.create('pipe')
//.8.Pipeline addNodeProperty
CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'fastRP', {
mutateProperty: 'embedding',
embeddingDimension: 256,
randomSeed: 42
})
//.9. Pipeline addFeature
CALL gds.beta.pipeline.linkPrediction.addFeature('pipe', 'hadamard', {
nodeProperties: ['embedding']
}) YIELD featureSteps
// Add more feature generation methods
CALL gds.beta.pipeline.linkPrediction.addFeature('pipe', 'l2', {
nodeProperties: ['embedding']
}) YIELD featureSteps
// Add cosine feature with nodeProperties configuration
CALL gds.beta.pipeline.linkPrediction.addFeature('pipe', 'cosine', {
nodeProperties: ['embedding']
}) YIELD featureSteps
//.10. Pipeline ConfigureSplit
CALL gds.beta.pipeline.linkPrediction.configureSplit('pipe', {
testFraction: 0.25,
trainFraction: 0.6,
validationFolds: 3
})
YIELD splitConfig
//.11. Pipeline addLogisticRegression
CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe')
YIELD parameterSpace
//.12. Pipeline addRandomForest
CALL gds.beta.pipeline.linkPrediction.addRandomForest('pipe', {
numberOfDecisionTrees: 50,
maxDepth: 20,
minSplitSize: 10,
minLeafSize: 5
})
YIELD parameterSpace
//.13. Pipeline configureAutoTuning
CALL gds.alpha.pipeline.linkPrediction.configureAutoTuning('pipe', {
maxTrials: 10 // Increased number of trials
}) YIELD autoTuningConfig
//.14. Pipeline train linkPrediction
CALL gds.beta.pipeline.linkPrediction.train('myGraph', {
pipeline: 'pipe',
modelName: 'pipe-model',
metrics: ['AUCPR', 'OUT_OF_BAG_ERROR'],
targetRelationshipType: 'HAS_RISK',
randomSeed: 12
}) YIELD modelInfo, modelSelectionStats
RETURN
modelInfo.bestParameters AS winningModel,
modelInfo.metrics.AUCPR.train.avg AS avgTrainScore,
modelInfo.metrics.AUCPR.outerTrain AS outerTrainScore,
modelInfo.metrics.AUCPR.test AS testScore,
[cand IN modelSelectionStats.modelCandidates | cand.metrics.AUCPR.validation.avg] AS validationScores;
//.15. Pipeline stream
CALL gds.beta.pipeline.linkPrediction.predict.stream('myGraph', {
modelName: 'pipe-model',
topN: 100,
threshold: 0.3
})
YIELD node1, node2, probability
MATCH (p1:Project), (p2:Risk)
WHERE id(p1) = node1 AND id(p2) = node2
RETURN p1.projectNumber AS projectNumber, p2.riskDescription AS riskDescription, probability
ORDER BY probability DESC
LIMIT 25