Problems with complex hierarchic query in Cyper


(Weidinger Christoph) #1

Dear all,

i have a problem writing a cypher query to get the correct result for a hierarchic result.

Here you can see the initial ERD from my SQL database:

I already wrote all needed cypher statements to load data, set index and create relationships. (see at the end of the ticket).

No i want to solve the following problem:

"I want to create a social netowrk for actors. All actors which play together already know each other. The system should provide a list of new contact suggestions that provides actor information from actor that know another actor over exactly one other actor. For having a detailed result the result should be given for actor "NICK WAHLBERG" and should only contain the first 13 film(FILM_ID <14).

In SQL i wrote the following query:
grafik

Is someone able to help me creating a cypher query to get the same result?
Thank you very much in advance

Here you can find all statements to create the graph:

LOAD CSV WITH HEADERS FROM "http://www.fimojules.at/data/payment.csv" AS row
CREATE (n:PAYMENT)
SET n = row
, n.AMOUNT = toFloat(row.AMOUNT)
, n.PAYMENT_DATE = row.PAYMENT_DATE
, n.LAST_UPDATE = row.LAST_UPDATE;

LOAD CSV WITH HEADERS FROM "http://www.fimojules.at/data/customer.csv" AS row
CREATE (n:CUSTOMER)
SET n = row
, n.FIRST_NAME = row.FIRST_NAME
, n.LAST_NAME = row.LAST_NAME
, n.EMAIL = row.EMAIL
, n.ACTIVE = row.ACTIVE
, n.CREATE_DATE = row.CREATE_DATE
, n.LAST_UPDATE = row.LAST_UPDATE;

LOAD CSV WITH HEADERS FROM "http://www.fimojules.at/data/staff.csv" AS row
CREATE (n:STAFF)
SET n = row
, n.FIRST_NAME = row.FIRST_NAME
, n.LAST_NAME = row.LAST_NAME
, n.PICTURE = row.PICTURE
, n.EMAIL = row.EMAIL
, n.ACTIVE = row.ACTIVE
, n.USERNAME = row.USERNAME
, n.PASSWORD = row.PASSWORD
, n.LAST_UPDATE = row.LAST_UPDATE;

LOAD CSV WITH HEADERS FROM "http://www.fimojules.at/data/address.csv" AS row
CREATE (n:ADDRESS)
SET n = row
, n.ADDRESS = row.ADDRESS
, n.ADDRESS2 = row.ADDRESS2
, n.DISTRICT = row.DISTRICT
, n.POSTAL_CODE = row.POSTAL_CODE
, n.PHONE = row.PHONE
, n.LAST_UPDATE = row.LAST_UPDATE;

LOAD CSV WITH HEADERS FROM "http://www.fimojules.at/data/city.csv" AS row
CREATE (n:CITY)
SET n = row
, n.CITY = row.CITY
, n.LAST_UPDATE = row.LAST_UPDATE;

LOAD CSV WITH HEADERS FROM "http://www.fimojules.at/data/country.csv" AS row
CREATE (n:COUNTRY)
SET n = row
, n.COUNTRY = row.COUNTRY
, n.LAST_UPDATE = row.LAST_UPDATE;

LOAD CSV WITH HEADERS FROM "http://www.fimojules.at/data/store.csv" AS row
CREATE (n:STORE)
SET n = row
, n.LAST_UPDATE = row.LAST_UPDATE;

LOAD CSV WITH HEADERS FROM "http://www.fimojules.at/data/rental.csv" AS row
CREATE (n:RENTAL)
SET n = row
, n.RENTAL_DATE = row.RENTAL_DATE
, n.RETURN_DATE = row.RETURN_DATE
, n.LAST_UPDATE = row.LAST_UPDATE;

LOAD CSV WITH HEADERS FROM "http://www.fimojules.at/data/inventory.csv" AS row
CREATE (n:INVENTORY)
SET n = row
, n.LAST_UPDATE = row.LAST_UPDATE;

LOAD CSV WITH HEADERS FROM "http://www.fimojules.at/data/film.csv" AS row
CREATE (n:FILM)
SET n = row
, n.TITLE = row.TITLE
, n.DESCRIPTION = row.DESCRIPTION
, n.RELEASE_YEAR = row.RELEASE_YEAR
, n.RENTAL_DURATION = row.RENTAL_DURATION
, n.RENTAL_RATE = row.RENTAL_RATE
, n.LENGTH = row.LENGTH
, n.REPLACEMENT_COST = row.REPLACEMENT_COST
, n.RATING = row.RATING
, n.SPECIAL_FEATURES = row.SPECIAL_FEATURES
, n.LAST_UPDATE = row.LAST_UPDATE;

LOAD CSV WITH HEADERS FROM "http://www.fimojules.at/data/language.csv" AS row
CREATE (n:LANGUAGE)
SET n = row
, n.NAME = row.NAME
, n.LAST_UPDATE = row.LAST_UPDATE;

LOAD CSV WITH HEADERS FROM "http://www.fimojules.at/data/actor.csv" AS row
CREATE (n:ACTOR)
SET n = row
, n.FIRST_NAME = row.FIRST_NAME
, n.LAST_NAME = row.LAST_NAME
, n.LAST_UPDATE = row.LAST_UPDATE;

LOAD CSV WITH HEADERS FROM "http://www.fimojules.at/data/category.csv" AS row
CREATE (n:CATEGORY)
SET n = row
, n.NAME = row.NAME
, n.LAST_UPDATE = row.LAST_UPDATE;

CREATE INDEX ON :PAYMENT(PAYMENT_ID);
CREATE INDEX ON :CUSTOMER(CUSTOMER_ID);
CREATE INDEX ON :STAFF(STAFF_ID);
CREATE INDEX ON :ADDRESS(ADDRESS_ID);
CREATE INDEX ON :CITY(CITY_ID);
CREATE INDEX ON :COUNTRY(COUNTRY_ID);
CREATE INDEX ON :STORE(STORE_ID);
CREATE INDEX ON :RENTAL(RENTAL_ID);
CREATE INDEX ON :INVENTORY(INVENTORY_ID);
CREATE INDEX ON :FILM(FILM_ID);
CREATE INDEX ON :LANGUAGE(LANGUAGE_ID);
CREATE INDEX ON :ACTOR(ACTOR_ID);
CREATE INDEX ON :CATEGORY(CATEGORY_ID);

MATCH (p:PAYMENT), (c:CUSTOMER)
WHERE p.CUSTOMER_ID = c.CUSTOMER_ID
CREATE (p)-[:PAYED_FROM]->(c);

MATCH (p:PAYMENT), (s:STAFF)
WHERE p.STAFF_ID = s.STAFF_ID
CREATE (p)-[:RECEIVED_BY]->(s);

MATCH (p:PAYMENT), (r:RENTAL)
WHERE p.RENTAL_ID = r.RENTAL_ID
CREATE (p)-[:PAYED_FOR]->(r);

MATCH (c:CUSTOMER), (s:STORE)
WHERE c.STORE_ID = s.STORE_ID
CREATE (c)-[:WORKS_AT]->(s);

MATCH (c:CUSTOMER), (a:ADDRESS)
WHERE c.ADDRESS_ID = a.ADDRESS_ID
CREATE (c)-[:LIVES_AT]->(a);

MATCH (s:STAFF), (s:STORE)
WHERE s.STORE_ID = s.STORE_ID
CREATE (s)-[:STA_WORKS_AT]->(s);

MATCH (s:STAFF), (a:ADDRESS)
WHERE s.ADDRESS_ID = a.ADDRESS_ID
CREATE (s)-[:STA_LIVES_AT]->(a);

MATCH (c:CITY), (a:ADDRESS)
WHERE c.CITY_ID = a.CITY_ID
CREATE (a)-[:LOCATED_IN]->(c);

MATCH (c:CITY), (co:COUNTRY)
WHERE c.COUNTRY_ID = co.COUNTRY_ID
CREATE (c)-[:PART_OF]->(co);

MATCH (s:STORE), (st:STAFF)
WHERE s.MANAGER_STAFF_ID = st.STAFF_ID
CREATE (s)-[:MANAGED_BY]->(st);

MATCH (S:STORE), (a:ADDRESS)
WHERE S.ADDRESS_ID = a.ADDRESS_ID
CREATE (S)-[:LOCATED_AT]->(a);

MATCH (r:RENTAL), (i:INVENTORY)
WHERE r.INVENTORY_ID = i.INVENTORY_ID
CREATE (r)-[:INVENTORIED]->(i);

MATCH (r:RENTAL), (c:CUSTOMER)
WHERE r.CUSTOMER_ID = c.CUSTOMER_ID
CREATE (r)-[:RENTED_FROM]->(c);

MATCH (r:RENTAL), (s:STAFF)
WHERE r.STAFF_ID = s.STAFF_ID
CREATE (r)-[:RENTED_BY]->(s);
	
MATCH (i:INVENTORY), (f:FILM)
WHERE i.FILM_ID = f.FILM_ID
CREATE (i)-[:CONTAINS]->(f);

MATCH (i:INVENTORY), (s:STORE)
WHERE i.STORE_ID = s.STORE_ID
CREATE (i)-[:INVENTORIED_AT]->(s);
	
MATCH (f:FILM), (l:LANGUAGE)
WHERE  f.LANGUAGE_ID = l.LANGUAGE_ID
CREATE (f)-[:HAS_LANGUAGE]->(l);

MATCH (f:FILM), (l:LANGUAGE)
WHERE f.ORIGINAL_LANGUAGE_ID = l.LANGUAGE_ID
CREATE (f)-[:HAS_ORIG_LANGUAGE]->(l);

LOAD CSV WITH HEADERS FROM "http://www.fimojules.at/data/film_actor.csv" AS row
MATCH (a:ACTOR), (f:FILM)
WHERE a.ACTOR_ID = row.ACTOR_ID AND f.FILM_ID = row.FILM_ID
CREATE (a)-[details:ACTS_IN]->(f)
SET details = row
,  details.LAST_UPDATE = row.LAST_UPDATE;

LOAD CSV WITH HEADERS FROM "http://www.fimojules.at/data/film_category.csv" AS row
MATCH (c:CATEGORY), (f:FILM)
WHERE c.CATEGORY_ID = row.CATEGORY_ID AND f.FILM_ID = row.FILM_ID
CREATE (f)-[details:HAS_CATEGORY]->(c)
SET details = row
,  details.LAST_UPDATE = row.LAST_UPDATE;


(Weidinger Christoph) #2

At the moment i tried the following query:

MATCH (a:ACTOR{LAST_NAME:"WAHLBERG"})-[:ACTS_IN]->(m1)<-[:ACTS_IN]-(actor2)
WITH DISTINCT actor2
MATCH (actor2)-[:ACTS_IN*1..1]->(m2)
RETURN distinct actor2.FIRST_NAME,actor2.LAST_NAME,  m2.FILM_ID
ORDER BY m2.FILM_ID ASC

Are there some parts which are correct?


(Mark Needham) #3

I'm not sure that I understand exactly what you want to do but I'll try to help....so this bit:

MATCH (a:ACTOR{LAST_NAME:"WAHLBERG"})-[:ACTS_IN]->(m1)<-[:ACTS_IN]-(actor2)
WITH DISTINCT actor2

Finds all the actors that have worked with Wahlberg. And then do you want to find other actors that they've acted with that Wahlberg hasn't worked with? If so then this should do it:

MATCH (a:ACTOR{LAST_NAME:"WAHLBERG"})-[:ACTS_IN]->(m1)<-[:ACTS_IN]-(actor2)
WITH DISTINCT a, actor2
MATCH (actor2)-[:ACTS_IN]->(m2)<-[:ACTS_IN]-(reco)
WHERE not((reco)-[:ACTS_IN]->()<-[:ACTS_IN]-(a))
RETURN reco, collect(m2) AS movies

(Andrew Bowman) #4

Hi there, one other optimization you could do is collect the co-actors so filtering at the end (to exclude co-actors from the result set) becomes less expensive:

MATCH (a:ACTOR{LAST_NAME:"WAHLBERG"})-[:ACTS_IN*2]-(coactor) 
WITH a, collect(DISTINCT coactor) as coactors
UNWIND coactors as coactor
MATCH (coactor)-[:ACTS_IN*2]-(reco) 
WHERE a <> reco
WITH DISTINCT reco, coactors
WHERE NOT reco IN coactors
RETURN reco.FIRST_NAME, reco.LAST_NAME

This gets you the names of actors that have co-acted with co-actors of WAHLBERG, but aren't coactors of WAHLBERG.

Note that this isn't addressing some parts of the problem you're trying to solve: "from actor that know another actor over exactly one other actor", you would need to break out of the efficiencies of this query, counting the number of connections to the recommended actor and filtering to only those that have a single path to them (only known through a single co-actor). You would also need to apply the restriction on the films used to find coactors: " should only contain the first 13 film(FILM_ID < 14)"

See if you can modify the above query to work with those additional restrictions.


(Weidinger Christoph) #5

Hi Mark,

thank you very much for your response. I analysed your query and understood it.
Nevertheless it seems that the query does not provide the correct result compared to my SQL result.

grafik

Sorry if i wrote my problem a bit confusing: "In case WAHLBERG works with SMITH and SMITH works with THOMSON and MONROE in other films, i want to provide a list with people WAHLBERG has no direct contact. Therefore THOMSON and MONROE should be on this list.

Do you know whats my problem?


(Weidinger Christoph) #6

Hi Andrew,

thank you very much for providing me your query. I have problems running this query without any issues:

Do you know what could be wrong?
Thank you in advance


(Andrew Bowman) #7

My fault there, forgot to include coactors in my WITH clause. I edited my query above to fix it.


(Ameyasoft) #8

Hi,

Here is my solution.

  1. Find actors that worked with Nick (ID = 2) and film ids < 14:

MATCH (a:Actor)-[:ACTS_IN]-> (f:Film) <-[:ACTS_IN]-(co:Actor)
WHERE a.id = 2 and f.id < 14
RETURN a, f, co;

  1. Find the other films that coactors worked:

MATCH (a:Actor)-[:ACTS_IN]-> (f:Film) <-[:ACTS_IN]-(co:Actor)
WHERE a.id = 2 and f.id < 14
WITH a, COLLECT(co) as actrs, f
UNWIND actrs as n
MATCH (n)-[:ACTS_IN]-> (f1:Film)
WHERE f1.id < 14 AND f1 <> f
WITH f1
MATCH (n1)-[:ACTS_IN]-> (f1)
RETURN n1, f1;
remote

  1. Final result of actors names:

MATCH (a:Actor)-[:ACTS_IN]-> (f:Film) <-[:ACTS_IN]-(co:Actor)
WHERE a.id = 2 and f.id < 14
WITH a, COLLECT(co) as actrs, f
UNWIND actrs as n
MATCH (n)-[:ACTS_IN]-> (f1:Film)
WHERE f1.id < 14 AND f1 <> f
WITH f1
MATCH (n1)-[:ACTS_IN]-> (f1)
WITH DISTINCT n1
RETURN n1.id as ID, n1.first as FirstName, n1.last as LastName ORDER BY ID;

-Kamal


(Weidinger Christoph) #9

Hi Andrew,

thank you very much. Nevertheless there is a mismatch between Cypher and SQL results.
Do you have any idea, what still could be different?


(Andrew Bowman) #10

I mentioned this after the query:

This gets you the names of actors that have co-acted with co-actors of WAHLBERG, but aren't coactors of WAHLBERG.

Note that this isn't addressing some parts of the problem you're trying to solve: "from actor that know another actor over exactly one other actor", you would need to break out of the efficiencies of this query, counting the number of connections to the recommended actor and filtering to only those that have a single path to them (only known through a single co-actor). You would also need to apply the restriction on the films used to find coactors: " should only contain the first 13 film(FILM_ID < 14)"

See if you can modify the above query to work with those additional restrictions.

Are all these conditions still applicable? Have you tried to modify my query to include them? The idea by providing this was to show an approach that you could reuse and modify.


(Ameyasoft) #11

Hi,

Hope the solution that I submitted ( see 6/10 message) will work for you. Let me know.

Thanks,
-Kamal