Node regression with subgraph and graph sample projections
This Jupyter notebook is hosted in the Neo4j Graph Data Science Client GitHub repository.
For a video presentation of this notebook, see the talk Neo4j GDS Series 2.x fundamentals - Pipelines and more, which was given at the NODES 2022 conference.
This notebook exemplifies how to use the node regression pipeline. It also contains many examples of using:

- Convenience objects
- Subgraph projections
- Graph sample projections
It is written in pure Python, to showcase the GDS Python client's ability to abstract away Cypher queries.
1. The dataset

Our input graph represents Wikipedia pages on particular topics, and how they link to each other:

- Chameleons
- Squirrels
- Crocodiles

The features are the presence of certain informative nouns in the text of the page. The target is the average monthly traffic of the page.

The dataset was first published in Multi-Scale Attributed Node Embedding by B. Rozemberczki, C. Allen and R. Sarkar, eprint arXiv:1909.13021. The version hosted here was taken from SNAP on 2022-11-14.
2. Prerequisites

To run this pipeline, you must have the following:

- A running Neo4j DBMS with:
  - A recent version of the Graph Data Science library (GDS) installed
  - A recent version of APOC installed

These requirements are satisfied if you have an AuraDS instance active and running.
# First, we must install the GDS Python Client
%pip install graphdatascience
import os
# Then, we connect to our Neo4j DBMS hosting the Graph Data Science library
from graphdatascience import GraphDataScience
# Get Neo4j DB URI, credentials and name from environment if applicable
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_AUTH = None
NEO4J_DB = os.environ.get("NEO4J_DB", "neo4j")
if os.environ.get("NEO4J_USER") and os.environ.get("NEO4J_PASSWORD"):
    NEO4J_AUTH = (
        os.environ.get("NEO4J_USER"),
        os.environ.get("NEO4J_PASSWORD"),
    )
gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH, database=NEO4J_DB)
# Test our connection and print the Graph Data Science library version
print(gds.version())
from graphdatascience.server_version.server_version import ServerVersion
assert gds.server_version() >= ServerVersion(2, 1, 0)
# Importing the dataset
# The dataset is sourced from this GitHub repository
baseUrl = (
"https://raw.githubusercontent.com/neo4j/graph-data-science-client/main/examples/datasets/wikipedia-animals-pages"
)
# Constraints to speed up importing
gds.run_cypher(
"""
CREATE CONSTRAINT chameleons
FOR (c:Chameleon)
REQUIRE c.id IS NODE KEY
"""
)
gds.run_cypher(
"""
CREATE CONSTRAINT crocodiles
FOR (c:Crocodile)
REQUIRE c.id IS NODE KEY
"""
)
gds.run_cypher(
"""
CREATE CONSTRAINT squirrels
FOR (s:Squirrel)
REQUIRE s.id IS NODE KEY
"""
)
# Create nodes and relationships
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/chameleon/musae_chameleon_edges.csv' AS row
MERGE (c1:Chameleon {id: row.id1})
MERGE (c2:Chameleon {id: row.id2})
MERGE (c1)-[:LINK]->(c2)
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/crocodile/musae_crocodile_edges.csv' AS row
MERGE (c1:Crocodile {id: row.id1})
MERGE (c2:Crocodile {id: row.id2})
MERGE (c1)-[:LINK]->(c2)
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/squirrel/musae_squirrel_edges.csv' AS row
MERGE (s1:Squirrel {id: row.id1})
MERGE (s2:Squirrel {id: row.id2})
MERGE (s1)-[:LINK]->(s2)
""",
{"baseUrl": baseUrl},
)
# Create target properties
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/chameleon/musae_chameleon_target.csv' AS row
MATCH (c:Chameleon {id: row.id})
SET c.target = toInteger(row.target)
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/crocodile/musae_crocodile_target.csv' AS row
MATCH (c:Crocodile {id: row.id})
SET c.target = toInteger(row.target)
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/squirrel/musae_squirrel_target.csv' AS row
MATCH (s:Squirrel {id: row.id})
SET s.target = toInteger(row.target)
""",
{"baseUrl": baseUrl},
)
# Create feature vectors
gds.run_cypher(
"""
CALL apoc.load.json($baseUrl + '/chameleon/musae_chameleon_features.json') YIELD value
WITH value, keys(value) AS keys
UNWIND keys AS key
WITH value[key] AS feature, key
MATCH (c:Chameleon {id: key})
SET c.features = feature
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
CALL apoc.load.json($baseUrl + '/crocodile/musae_crocodile_features.json') YIELD value
WITH value, keys(value) AS keys
UNWIND keys AS key
WITH value[key] AS feature, key
MATCH (c:Crocodile {id: key})
SET c.features = feature
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
CALL apoc.load.json($baseUrl + '/squirrel/musae_squirrel_features.json') YIELD value
WITH value, keys(value) AS keys
UNWIND keys AS key
WITH value[key] AS feature, key
MATCH (c:Squirrel {id: key})
SET c.features = feature
""",
{"baseUrl": baseUrl},
)
3. Preparing the dataset for the pipeline

In order to use the dataset, we must prepare the features in a format that the model supports and works well with. In their raw form, the features are IDs of particular words, which makes them unsuitable as input to linear regression.

To overcome this, we will use one-hot encoding, which produces features that work well for linear regression. We first learn a dictionary of the nouns over each node set. We create a node to host the dictionary, and then use it to one-hot encode all the feature vectors. A small pure-Python sketch of the idea follows.
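To make this concrete, here is a minimal pure-Python sketch of what one-hot encoding produces. It is illustrative only: the actual encoding below is computed server-side by gds.alpha.ml.oneHotEncoding, and one_hot here is a hypothetical helper.
# A dictionary (the "totality") is a sorted list of all distinct noun IDs;
# a node's encoded vector has a 1 wherever its feature list contains that noun
def one_hot(totality, features):
    present = set(features)
    return [1 if noun in present else 0 for noun in totality]
# Example: a dictionary of 5 noun IDs, and a node mentioning two of them
print(one_hot([11, 42, 99, 100, 250], [42, 250]))  # [0, 1, 0, 0, 1]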
# Construct one-hot dictionaries
gds.run_cypher(
"""
MATCH (s:Chameleon)
WITH s.features AS features
UNWIND features AS feature
WITH feature
ORDER BY feature ASC
WITH collect(distinct feature) AS orderedTotality
CREATE (:Feature {animal: 'chameleon', totality: orderedTotality})
RETURN orderedTotality
"""
)
gds.run_cypher(
"""
MATCH (s:Crocodile)
WITH s.features AS features
UNWIND features AS feature
WITH feature
ORDER BY feature ASC
WITH collect(distinct feature) AS orderedTotality
CREATE (:Feature {animal: 'crocodile', totality: orderedTotality})
RETURN orderedTotality
"""
)
gds.run_cypher(
"""
MATCH (s:Squirrel)
WITH s.features AS features
UNWIND features AS feature
WITH feature
ORDER BY feature ASC
WITH collect(distinct feature) AS orderedTotality
CREATE (:Feature {animal: 'squirrel', totality: orderedTotality})
RETURN orderedTotality
"""
)
# Do one-hot encoding
gds.run_cypher(
"""
MATCH (f:Feature {animal: 'chameleon'})
MATCH (c:Chameleon)
SET c.features_one_hot = gds.alpha.ml.oneHotEncoding(f.totality, c.features)
"""
)
gds.run_cypher(
"""
MATCH (f:Feature {animal: 'crocodile'})
MATCH (c:Crocodile)
SET c.features_one_hot = gds.alpha.ml.oneHotEncoding(f.totality, c.features)
"""
)
gds.run_cypher(
"""
MATCH (f:Feature {animal: 'squirrel'})
MATCH (c:Squirrel)
SET c.features_one_hot = gds.alpha.ml.oneHotEncoding(f.totality, c.features)
"""
)
# First, let's project our graph into the GDS Graph Catalog
# We will use a native projection to begin with
G_animals, projection_result = gds.graph.project(
"wiki_animals",
["Chameleon", "Squirrel", "Crocodile"],
{"LINK": {"orientation": "UNDIRECTED"}},
nodeProperties=["features_one_hot", "target"],
)
print(projection_result[["graphName", "nodeCount", "relationshipCount"]])
4. Connectedness

In graph analytics, it is common to operate only over connected graphs, that is, graphs consisting of a single component. The reason is that, in most cases, information does not flow where there are no connections.

The fastest way to determine the number of components in our graph is to use the WCC (Weakly Connected Components) algorithm.
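If all we wanted was the number of components, the stats mode of WCC would suffice. A minimal sketch, assuming gds.wcc.stats is available in your GDS version:
# Stats mode computes the component count without storing a per-node
# component id on the in-memory graph
print(gds.wcc.stats(G_animals)["componentCount"])
Here we use the mutate mode instead, because the next section also needs the component id of each node.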
# We use the WCC algorithm to see how many components we have
wcc_result = gds.wcc.mutate(G_animals, mutateProperty="wcc_component")
print(wcc_result[["computeMillis", "componentCount"]])
5. Component separation

Knowing that our graph consists of three components, we will next separate the components into one graph each. We will use the subgraph projection to accomplish this, creating one subgraph per component.
# First, we stream the component ids
components = gds.graph.nodeProperty.stream(G_animals, "wcc_component")
# Second, we compute the unique component ids
component_ids = components["propertyValue"].unique()
# Third, we project a subgraph for each component
component_graphs = [
gds.beta.graph.project.subgraph(
f"animals_component_{component_id}",
G_animals,
f"n.wcc_component = {component_id}",
"*",
)[0]
for component_id in component_ids
]
# Lastly, we map each graph's node labels to the graph object
graph_components_by_labels = {str(G_component.node_labels()): G_component for G_component in component_graphs}
print({k: v.name() for k, v in graph_components_by_labels.items()})
# Now, we are only interested in the Chameleon graph,
# so we will drop the other graphs and define a better variable for the one we keep
graph_components_by_labels[str(["Crocodile"])].drop()
graph_components_by_labels[str(["Squirrel"])].drop()
G_chameleon = graph_components_by_labels[str(["Chameleon"])]
# With the graph object G_chameleon, we can inspect some statistics
print("#nodes: " + str(G_chameleon.node_count()))
print("#relationships: " + str(G_chameleon.relationship_count()))
print("Degree distribution")
print("=" * 25)
print(G_chameleon.degree_distribution().sort_index())
6. Now, let's construct a training pipeline!

We create a node regression pipeline, and then:

- Configure the splitting
- Add model candidates
- Configure auto-tuning
- Add node property steps
- Select model features

The pipeline lives in the pipeline catalog, and we operate on it through a pipeline object, for maximal convenience.
# Now, let's construct a training pipeline!
chameleons_nr_training = gds.nr_pipe("node_regression_pipeline__Chameleons")
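As a quick sanity check, we can confirm that the pipeline was registered in the pipeline catalog. This sketch assumes gds.pipeline.list() is available in your client; on older client/server combinations, the equivalent listing lives under the beta namespace as gds.beta.pipeline.list().
# List the pipelines currently in the pipeline catalog
print(gds.pipeline.list()[["pipelineName", "pipelineType"]])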
# We configure the splitting
chameleons_nr_training.configureSplit(validationFolds=5, testFraction=0.2)
# We add a set of model candidates
# A linear regression model with the learningRate parameter in a search space
chameleons_nr_training.addLinearRegression(
penalty=1e-5,
patience=3,
tolerance=1e-5,
minEpochs=20,
maxEpochs=500,
learningRate={"range": [100, 1000]}, # We let the auto-tuner find a good value
)
# Let's try a few different models
chameleons_nr_training.configureAutoTuning(maxTrials=10)
# Our input feature dimension is 3132
# We can reduce the dimension to speed up training using a FastRP node embedding
chameleons_nr_training.addNodeProperty(
"fastRP",
embeddingDimension=256,
propertyRatio=0.8,
featureProperties=["features_one_hot"],
mutateProperty="frp_embedding",
randomSeed=420,
)
# And finally we select what features the model should be using
# We rely on the FastRP embedding solely, because it encapsulates the one-hot encoded source features
chameleons_nr_training.selectFeatures("frp_embedding")
# The training pipeline is now fully configured and ready to be run!
# We use the training pipeline to train a model
nc_model, train_result = chameleons_nr_training.train(
G_chameleon, # First, we use the entire Chameleon graph
modelName="chameleon_nr_model",
targetNodeLabels=["Chameleon"],
targetProperty="target",
metrics=["MEAN_SQUARED_ERROR", "MEAN_ABSOLUTE_ERROR"],
randomSeed=420,
)
print("Winning model parameters: \n\t\t" + str(train_result["modelInfo"]["bestParameters"]))
print()
print("MEAN_SQUARED_ERROR test score: " + str(train_result["modelInfo"]["metrics"]["MEAN_SQUARED_ERROR"]["test"]))
print("MEAN_ABSOLUTE_ERROR test score: " + str(train_result["modelInfo"]["metrics"]["MEAN_ABSOLUTE_ERROR"]["test"]))
# Let's sample the graph to see if we can get a similarly good model
G_chameleon_sample, _ = gds.alpha.graph.sample.rwr(
"cham_sample",
G_chameleon,
samplingRatio=0.30, # We'll use 30% of the graph
)
# Now we can use the same training pipeline to train another model, but faster!
nc_model_sample, train_result_sample = chameleons_nr_training.train(
G_chameleon_sample,
modelName="chameleon_nr_model_sample",
targetNodeLabels=["Chameleon"],
targetProperty="target",
metrics=["MEAN_SQUARED_ERROR", "MEAN_ABSOLUTE_ERROR"],
randomSeed=420,
)
print("Winning model parameters: \n\t\t" + str(train_result_sample["modelInfo"]["bestParameters"]))
print()
print(
"MEAN_SQUARED_ERROR test score: "
+ str(train_result_sample["modelInfo"]["metrics"]["MEAN_SQUARED_ERROR"]["test"])
)
print(
"MEAN_ABSOLUTE_ERROR test score: "
+ str(train_result_sample["modelInfo"]["metrics"]["MEAN_ABSOLUTE_ERROR"]["test"])
)
# Let's see what our models predict
# The speed-trained model on 24% training data (30% sample - 20% test set)
predicted_targets_sample = nc_model_sample.predict_stream(G_chameleon)
# The fully trained model on 80% training data (20% test set)
predicted_targets_full = nc_model.predict_stream(G_chameleon)
# The original training data for comparison
real_targets = gds.graph.nodeProperty.stream(G_chameleon, "target")
# Merging the data frames
merged_full = real_targets.merge(predicted_targets_full, left_on="nodeId", right_on="nodeId")
merged_all = merged_full.merge(predicted_targets_sample, left_on="nodeId", right_on="nodeId")
# Look at the last 10 rows
print(merged_all.tail(10))
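To quantify how close the two models get on the full graph, we can compute the mean absolute error of each directly from the merged DataFrame. The column names are assumptions: propertyValue holds the true target, and pandas suffixes the two predictedValue columns from the merges above as _x (full model) and _y (sample-trained model).
# Mean absolute error of both models over the whole Chameleon graph
mae_full = (merged_all["propertyValue"] - merged_all["predictedValue_x"]).abs().mean()
mae_sample = (merged_all["propertyValue"] - merged_all["predictedValue_y"]).abs().mean()
print("MAE (model trained on full graph): " + str(mae_full))
print("MAE (model trained on 30% sample): " + str(mae_sample))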