def generate_caption(image):
    """Greedily decode a caption for *image* using a GRU with attention.

    Starts from the image-conditioned initial GRU state, then repeatedly
    feeds the previously predicted token back in, attends over the encoder
    output, and picks the next token, stopping at ``'<end>'`` or after
    ``max_caption_length`` steps.

    NOTE(review): relies on module-level ``gru``, ``attention_layer``,
    ``output_layer``, ``encoder_output``, ``token_to_vector``,
    ``vector_to_token`` and ``max_caption_length`` — confirm they are in
    scope. ``encoder_output`` presumably comes from encoding *image*
    upstream; verify against the caller.

    Args:
        image: Encoded image representation accepted by
            ``gru.get_initial_state``.

    Returns:
        str: Space-joined predicted tokens, excluding the ``'<end>'``
        marker. Empty string if ``'<end>'`` is predicted immediately.
    """
    # Thread the recurrent state through the loop — the original passed the
    # unchanged initial state every step, so the GRU never advanced.
    state = gru.get_initial_state(image)
    # Assumed start-of-sequence marker — TODO confirm against the vocabulary.
    # (The original read input_token before ever assigning it.)
    input_token = '<start>'
    caption = []  # was never initialized in the original
    for _ in range(max_caption_length):
        input_vector = token_to_vector(input_token)
        features, state = gru(input_vector, state)
        attention = attention_layer([features, encoder_output])
        output = output_layer(attention)
        predicted_token = vector_to_token(output)
        if predicted_token == '<end>':
            # Stop decoding; keep the end marker out of the caption.
            break
        caption.append(predicted_token)
        # Greedy decoding: feed the prediction back as the next input
        # (the original only did this inside the '<end>' branch, so the
        # loop neither terminated nor advanced its input).
        input_token = predicted_token
    return ' '.join(caption)