Hello Brian,
Thank you for your reply.
I have some Python code to show the issue I am having.
The code below creates 100 random (x, y) nodes in a Neo4j DB for training.
After training the pipeline, the code adds another 10 nodes with x and expected_y values and gets the model to predict the y value.
When I run the code with a zero offset/intercept I get good results:
x:1.7572, expected_y: 17.5722, predicted value: 17.2145
x:25.6149, expected_y: 256.1489, predicted value: 255.7478
x:44.178, expected_y: 441.7801, predicted value: 441.3453
x:2.4678, expected_y: 24.6784, predicted value: 24.3194
x:6.0407, expected_y: 60.407, predicted value: 60.0414
x:11.9523, expected_y: 119.5231, predicted value: 119.1468
x:26.8498, expected_y: 268.4982, predicted value: 268.0949
x:45.6542, expected_y: 456.5424, predicted value: 456.1049
x:6.2703, expected_y: 62.7028, predicted value: 62.3369
When I run the code with an offset/intercept of 100 I get bad results:
x:1.7572, expected_y: 117.5722, predicted value: 74.2811
x:25.6149, expected_y: 356.1489, predicted value: 344.5669
x:44.178, expected_y: 541.7801, predicted value: 554.8702
x:2.4678, expected_y: 124.6784, predicted value: 82.3318
x:6.0407, expected_y: 160.407, predicted value: 122.809
x:11.9523, expected_y: 219.5231, predicted value: 189.7822
x:26.8498, expected_y: 368.4982, predicted value: 358.5575
x:45.6542, expected_y: 556.5424, predicted value: 571.5946
x:6.2703, expected_y: 162.7028, predicted value: 125.41
Here is the code. Simply swap which of the two OFFSET lines near the top is commented out to get a zero or 100 offset/intercept.
import random
from graphdatascience import GraphDataScience
OFFSET = 0  # intercept added to expected_y = 10 * x; with zero the predictions closely match expected_y
# OFFSET = 100  # alternative setting: a non-zero intercept, which reproduces the bad predictions
def main():
    """Train a GDS node-regression pipeline on synthetic data and print predictions.

    Generates noisy samples of ``y = 10 * x + OFFSET``, stores them as
    ``TrainingData`` nodes, trains a linear-regression pipeline on them, then
    creates ``PredictionData`` nodes and prints, for each one, the model's
    predicted value next to the noise-free expected value.

    Raises:
        RuntimeError: If the training projection returns an empty result or
            the training result reports a negative ``trainMillis``.
    """
    training_count = 100  # number of TrainingData nodes to create
    prediction_count = 10  # number of PredictionData nodes to create

    # connect to DB
    gds = GraphDataScience(endpoint='neo4j://localhost:7687', auth=('neo4j', 'MyPassword'), database='neo4j')

    # delete leftovers from a previous run (if they exist)
    gds.graph.drop(graph='prodict-projection', failIfMissing=False)
    gds.graph.drop(graph='training-projection', failIfMissing=False)
    if gds.pipeline.exists('lr-pipe')["exists"]:
        gds.pipeline.drop(gds.pipeline.get('lr-pipe'))
    if gds.model.exists('predict-model')["exists"]:
        gds.model.drop(gds.model.get('predict-model'))
    gds.run_cypher('MATCH (n:TrainingData|PredictionData) DELETE n')

    # create the pipeline
    pipeline, _ = gds.alpha.pipeline.nodeRegression.create('lr-pipe')
    pipeline.selectFeatures('x')  # 'x' is the only input feature
    # NOTE(review): tolerance=0.1 looks coarse for targets in the hundreds;
    # presumably training can stop before the intercept has converged --
    # confirm against the GDS linear-regression training documentation.
    pipeline.addLinearRegression(
        minEpochs=(500, 1000),
        maxEpochs=(3000, 5000),
        learningRate=(0.01, 0.1),
        patience=(1, 10),
        tolerance=0.1,
    )

    # Fixed seed so every run generates the same random data.
    random.seed(3)

    def create_random_x_y_value():
        """Return ``[x, y, expected_y]`` for one random sample.

        ``x`` is uniform in [0, 50), ``expected_y`` is ``10 * x + OFFSET``
        and ``y`` is ``expected_y`` plus a uniform variation in [-5, 5).
        """
        x = random.random() * 50
        expected_y = (x * 10) + OFFSET
        variation = (random.random() - 0.5) * 10
        y = expected_y + variation
        return [x, y, expected_y]

    # create training data; range(n) yields exactly n samples
    training_numbers = [create_random_x_y_value() for _ in range(training_count)]
    # Pass the rows as a query parameter instead of interpolating them into
    # the Cypher text.
    gds.run_cypher(
        'UNWIND $rows AS t WITH t[0] AS x, t[1] AS y MERGE (n:TrainingData {x: x, y: y})',
        params={'rows': training_numbers},
    )

    # create the training data projection
    training_projection, project_result = gds.graph.project(
        graph_name='training-projection',
        node_spec={'TrainingData': {'properties': ['x', 'y']}},
        relationship_spec=['*'],
    )
    if project_result.empty:
        raise RuntimeError('Projecting the training graph returned an empty result')

    # train the pipeline
    predictor_model, train_result = pipeline.train(
        training_projection,
        modelName='predict-model',
        targetProperty='y',
        metrics=['MEAN_SQUARED_ERROR'],
        randomSeed=42,
    )
    if train_result["trainMillis"] < 0:
        raise RuntimeError('Training reported a negative trainMillis')

    # create data to run prediction on (only x and expected_y are stored)
    prediction_numbers = [create_random_x_y_value() for _ in range(prediction_count)]
    gds.run_cypher(
        'UNWIND $rows AS t WITH t[0] AS x, t[2] AS expected '
        'MERGE (n:PredictionData {x: x, expected_y: expected})',
        params={'rows': prediction_numbers},
    )

    # create prediction data projection
    predict_projection, project_result = gds.graph.project(
        graph_name='prodict-projection',
        node_spec={'PredictionData': {'properties': ['x']}},
        relationship_spec=['*'],
    )

    # predict using the model from the pipeline
    predictions = predictor_model.predict_stream(predict_projection, concurrency=2)

    # Fetch x and expected_y for all prediction nodes in one query and join
    # them to the predictions on nodeId, rather than querying once per row.
    expected = gds.run_cypher(
        'MATCH (n:PredictionData) RETURN id(n) AS nodeId, n.x AS x, n.expected_y AS expected_y'
    )
    report = predictions.merge(expected, on='nodeId')

    # display predictions
    for row in report.itertuples(index=False):
        print(f"x:{round(row.x, 4)}, expected_y: {round(row.expected_y, 4)}, predicted value: {round(row.predictedValue, 4)}")
# Run the demo only when this file is executed as a script, so importing it
# does not trigger the database workflow.
if __name__ == "__main__":
    main()
I have tried different settings, etc., but I just can't get any data with a non-zero intercept to work.
Any help would be greatly appreciated.
Thank you.