Dustin Coates in Alexa

ResponseBuilder.js: building responses with Echo Show template support

We previously looked at ResponseBuilder, so why are we looking at it again? Well, since this series started, the Alexa Skills Kit Node.js SDK version 1.0.12 has been released. This saw a big refactoring of the ResponseBuilder and, excitingly, more modern JS! You can find the new ResponseBuilder.js here.

As a reminder, this is the Dig Deep series, where we look line-by-line at the tools and libraries we use to build voice-first experiences. This is not the place to go for tutorials, but if you want to learn interesting little nuggets about what you use every day, off we go…

const createSSMLSpeechObject = (message) => {
  return {
    type: 'SSML',
    ssml: `<speak> ${message} </speak>`
  };
};

const buildCard = (cardTitle, cardContent, cardImage) => {
  let card = {
    type: CARD_TYPES.SIMPLE,
    title: cardTitle,
    content: cardContent
  };

  if(cardImage && (cardImage.smallImageUrl || cardImage.largeImageUrl)) {
    card.type = CARD_TYPES.STANDARD;
    card.image = {};

    delete card.content;
    card.text = cardContent;

    if(cardImage.smallImageUrl) {
      card.image.smallImageUrl = cardImage.smallImageUrl;
    }

    if(cardImage.largeImageUrl) {
      card.image.largeImageUrl = cardImage.largeImageUrl;
    }
  }

  return card;
};

const CARD_TYPES = {
  STANDARD : 'Standard',
  SIMPLE : 'Simple',
  LINK_ACCOUNT : 'LinkAccount'
};

const HINT_TYPES = {
  PLAIN_TEXT : 'PlainText'
};

const DIRECTIVE_TYPES = {
  AUDIOPLAYER : {
    PLAY : 'AudioPlayer.Play',
    STOP : 'AudioPlayer.Stop',
    CLEAR_QUEUE : 'AudioPlayer.ClearQueue'
  },
  DISPLAY : {
    RENDER_TEMPLATE : 'Display.RenderTemplate'
  },
  HINT : 'Hint',
  VIDEOAPP : {
    LAUNCH : 'VideoApp.Launch'
  }
};

class ResponseBuilder {
  constructor(alexaHandler) { // property : response
    this._responseObject = alexaHandler.response;
    this._responseObject.version = '1.0';
    this._responseObject.response = {
      shouldEndSession : true
    };

    this._responseObject.sessionAttributes = alexaHandler._event.session.attributes;
  }

  speak(speechOutput) {
    this._responseObject.response.outputSpeech = createSSMLSpeechObject(speechOutput);
    return this;
  }

  listen(repromptSpeech) {
    this._responseObject.response.reprompt = {
      outputSpeech: createSSMLSpeechObject(repromptSpeech)
    };
    this._responseObject.response.shouldEndSession = false;
    return this;
  }

  cardRenderer(cardTitle, cardContent, cardImage) {
    const card = buildCard(cardTitle, cardContent, cardImage);
    this._responseObject.response.card = card;
    return this;
  }

  linkAccountCard() {
    this._responseObject.response.card = {
      type: CARD_TYPES.LINK_ACCOUNT
    };
    return this;
  }

  audioPlayer(directiveType, behavior, url, token, expectedPreviousToken, offsetInMilliseconds) {
    if (directiveType === 'play') {
      return this.audioPlayerPlay(behavior, url, token, expectedPreviousToken, offsetInMilliseconds);
    } else if (directiveType === 'stop') {
      return this.audioPlayerStop();
    } else {
      return this.audioPlayerClearQueue(behavior);
    }
  }

  audioPlayerPlay(behavior, url, token, expectedPreviousToken, offsetInMilliseconds) {
    const audioPlayerDirective = {
      type : DIRECTIVE_TYPES.AUDIOPLAYER.PLAY,
      playBehavior: behavior,
      audioItem: {
        stream: {
          url: url,
          token: token,
          expectedPreviousToken: expectedPreviousToken,
          offsetInMilliseconds: offsetInMilliseconds
        }
      }
    };

    this._addDirective(audioPlayerDirective);
    return this;
  }

  audioPlayerStop() {
    const audioPlayerDirective = {
      'type': DIRECTIVE_TYPES.AUDIOPLAYER.STOP
    };

    this._addDirective(audioPlayerDirective);
    return this;
  }

  audioPlayerClearQueue(clearBehavior) {
    const audioPlayerDirective = {
      type : DIRECTIVE_TYPES.AUDIOPLAYER.CLEAR_QUEUE,
      clearBehavior : clearBehavior
    };

    this._addDirective(audioPlayerDirective);
    return this;
  }

  renderTemplate(template) {
    const templateDirective = {
      type : DIRECTIVE_TYPES.DISPLAY.RENDER_TEMPLATE,
      template : template
    };

    this._addDirective(templateDirective);
    return this;
  }

  hint(hintText, hintType) {
    if(!hintType) {
      hintType = HINT_TYPES.PLAIN_TEXT;
    }

    const hintDirective = {
      type : DIRECTIVE_TYPES.HINT,
      hint : {
        type : hintType,
        text : hintText
      }
    };

    this._addDirective(hintDirective);
    return this;
  }

  playVideo(source, metadata) {
    const playVideoDirective = {
      type : DIRECTIVE_TYPES.VIDEOAPP.LAUNCH,
      videoItem : {
        source : source
      }
    };

    if (playVideoDirective.videoItem.metadata) {
      playVideoDirective.videoItem.metadata = metadata;
    }

    // Note : shouldEndSession flag is not allowed with LaunchVideoApp.Launch Directive
    delete this._responseObject.response.shouldEndSession;
    this._addDirective(playVideoDirective);
    return this;
  }

  _addDirective(directive) {
    if(!Array.isArray(this._responseObject.response.directives)) {
      this._responseObject.response.directives = [];
    }

    this._responseObject.response.directives.push(directive);
  }
}

module.exports.ResponseBuilder = ResponseBuilder;
module.exports.CARD_TYPES = CARD_TYPES;
module.exports.DIRECTIVE_TYPES = DIRECTIVE_TYPES;
module.exports.HINT_TYPES = HINT_TYPES;

createSSMLSpeechObject

const createSSMLSpeechObject = (message) => {
  return {
    type: 'SSML',
    ssml: `<speak> ${message} </speak>`
  };
};

Not much to say here, except that it returns an object with a type ('SSML') and the SSML itself, wrapped in a <speak> tag. Same as the last version (and much better than having to do it ourselves), except we’ve moved on to fat arrow functions. Get excited!
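To make that concrete, here's what a hypothetical call produces (the output follows directly from the function above):

createSSMLSpeechObject('Hello there');
// => { type: 'SSML', ssml: '<speak> Hello there </speak>' }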

buildCard

const buildCard = (cardTitle, cardContent, cardImage) => {
  let card = {
    type: CARD_TYPES.SIMPLE,
    title: cardTitle,
    content: cardContent
  };

  if(cardImage && (cardImage.smallImageUrl || cardImage.largeImageUrl)) {
    card.type = CARD_TYPES.STANDARD;
    card.image = {};

    delete card.content;
    card.text = cardContent;

    if(cardImage.smallImageUrl) {
      card.image.smallImageUrl = cardImage.smallImageUrl;
    }

    if(cardImage.largeImageUrl) {
      card.image.largeImageUrl = cardImage.largeImageUrl;
    }
  }

  return card;
};


const CARD_TYPES = {
  STANDARD : 'Standard',
  SIMPLE : 'Simple',
  LINK_ACCOUNT : 'LinkAccount'
};

In the past, the logic to build a card was in each individual conditional. The Amazon team has abstracted a lot of that away. Every card has a type (either 'Standard' for cards with images, 'Simple' for cards with just text, or 'LinkAccount' to link an account on your side to an Alexa skill).

Simple Card

A simple card. For what it’s worth, I wouldn’t recommend a card like this. What value does it provide to the user?

Standard Card

A standard card with an image.

Account Linking Card

An account linking card.

Something to note is that the buildCard function does not build account-linking cards; those are handled elsewhere.

Each card also has a title and body text, though this is content for a card without image and text for those with an image.

One take-away that I think is easily skipped over in the SDK documentation, particularly when looking at the cardRenderer method, is that the images come as part of an object, with largeImageUrl and smallImageUrl. If you provide only one of the two, Amazon will default to using it for both. If you supply just a small image, it will appear blown up in large contexts. If you supply just a large image, it will take longer to load in small contexts.

buildCard is not available for us to use directly, but is used as part of cardRenderer, which we have at our disposal when building response objects.
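To see the content-to-text swap in action, here's a sketch of what buildCard returns when an image is supplied (the values are hypothetical):

buildCard('Album of the Day', 'Live at the Fillmore', {
  smallImageUrl: 'https://example.com/small.png',
  largeImageUrl: 'https://example.com/large.png'
});
// => {
//   type: 'Standard',
//   title: 'Album of the Day',
//   text: 'Live at the Fillmore', // note: text, not content
//   image: {
//     smallImageUrl: 'https://example.com/small.png',
//     largeImageUrl: 'https://example.com/large.png'
//   }
// }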

ResponseBuilder

class ResponseBuilder {
  constructor(alexaHandler) { // property : response
    this._responseObject = alexaHandler.response;
    this._responseObject.version = '1.0';
    this._responseObject.response = {
      shouldEndSession : true
    };

    this._responseObject.sessionAttributes = alexaHandler._event.session.attributes;
  }
}

More modern JS! Love it or hate it, JavaScript now has classes and Amazon’s riding the wave.

The ResponseBuilder class is used in exactly one place: inside RegisterHandlers, to provide a response property on the handler object that's built.

The constructor is setting up the response object, first by setting _responseObject to alexaHandler.response. At this point it is just an empty object. The version is set to 1.0 (Amazon seems fairly judicious with their version numbers, so I don’t foresee this really changing too often) and shouldEndSession is set to true. The session will be ended in all responses except for listen. And, finally, we get another way to access the session attributes via sessionAttributes.
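Put together, right after construction the response object looks roughly like this (assuming the incoming session carried attributes):

// State of this._responseObject after the constructor runs
{
  version: '1.0',
  response: {
    shouldEndSession: true
  },
  sessionAttributes: { /* copied from event.session.attributes */ }
}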

speak and listen

speak(speechOutput) {
  this._responseObject.response.outputSpeech = createSSMLSpeechObject(speechOutput);
  return this;
}

listen(repromptSpeech) {
  this._responseObject.response.reprompt = {
    outputSpeech: createSSMLSpeechObject(repromptSpeech)
  };
  this._responseObject.response.shouldEndSession = false;
  return this;
}

The two simplest methods. Unless you don't want Alexa to say anything at all, you'll likely always have speak, as it sets the speech that Alexa will perform for the user. Note that it's setting the value of outputSpeech to an object created by createSSMLSpeechObject.

listen will never be used on its own—it sets the reprompt speech. This is the speech that Alexa will perform if a user hasn't responded to the initial prompt within 8 seconds. This is at once a long time, and not that long at all. It sets shouldEndSession to false, as otherwise Alexa will not wait for the reprompt. No other method changes the value of shouldEndSession, so you can be confident that if you call listen, the line will stay open.
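In a handler, the two typically travel together. A minimal sketch, assuming the usual this.response/this.emit handler pattern:

// Ask a question and keep the session open for the answer
this.response
  .speak('What album would you like to hear?')
  .listen('You can name any album, or say help.');
this.emit(':responseReady');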

cardRenderer and linkAccountCard

cardRenderer(cardTitle, cardContent, cardImage) {
  const card = buildCard(cardTitle, cardContent, cardImage);
  this._responseObject.response.card = card;
  return this;
}

linkAccountCard() {
  this._responseObject.response.card = {
    type: CARD_TYPES.LINK_ACCOUNT
  };
  return this;
}

Two card-related methods, both of which set the value of response.card. This means that you can't send both a standard or simple card and a link account card. We've already looked at the buildCard function, so I'll just point out again that you need to provide cardTitle, cardContent, and (optionally) a cardImage object if you're using cardRenderer, and nothing at all if you're using linkAccountCard. You are likely asking: how does Alexa know where to send the user to link the account? You set this URL in the developer console.
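A quick sketch of both (the values are hypothetical); because both assign to response.card, whichever is called last wins:

// A standard card alongside speech
this.response
  .speak('Check the Alexa app for your card.')
  .cardRenderer('Album of the Day', 'Live at the Fillmore', {
    largeImageUrl: 'https://example.com/large.png'
  });

// An account linking card takes no arguments at all
this.response.linkAccountCard();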

Playing Audio

audioPlayer(directiveType, behavior, url, token, expectedPreviousToken, offsetInMilliseconds) {
  if (directiveType === 'play') {
    return this.audioPlayerPlay(behavior, url, token, expectedPreviousToken, offsetInMilliseconds);
  } else if (directiveType === 'stop') {
    return this.audioPlayerStop();
  } else {
    return this.audioPlayerClearQueue(behavior);
  }
}

audioPlayerPlay(behavior, url, token, expectedPreviousToken, offsetInMilliseconds) {
  const audioPlayerDirective = {
    type : DIRECTIVE_TYPES.AUDIOPLAYER.PLAY,
    playBehavior: behavior,
    audioItem: {
      stream: {
        url: url,
        token: token,
        expectedPreviousToken: expectedPreviousToken,
        offsetInMilliseconds: offsetInMilliseconds
      }
    }
  };

  this._addDirective(audioPlayerDirective);
  return this;
}

audioPlayerStop() {
  const audioPlayerDirective = {
    'type': DIRECTIVE_TYPES.AUDIOPLAYER.STOP
  };

  this._addDirective(audioPlayerDirective);
  return this;
}

audioPlayerClearQueue(clearBehavior) {
  const audioPlayerDirective = {
    type : DIRECTIVE_TYPES.AUDIOPLAYER.CLEAR_QUEUE,
    clearBehavior : clearBehavior
  };

  this._addDirective(audioPlayerDirective);
  return this;
}

Now we’ve got the audioPlayer method, which is just a convenience for three methods for playing audio: play, stop, and clearing the queue.

This is nearly exactly what we saw before. The only difference is that a lot of duplication was removed (before, the directive object was built in both audioPlayer and each individual method) and that there’s now a new method for adding a directive (_addDirective). We’ll look at _addDirective near the end of the post, but it’s nothing too special: it adds the directive to an array.

Since these are the same as before, please forgive me if I repeat myself (this will be new to you if you haven’t read the previous version of this post):

The audioPlayer method takes up to six arguments. The first one is always mandatory and is the action you wish to take. The options are play, stop, and clearQueue. If you provide anything else, you might as well provide clearQueue because it’s the fallthrough case.

Of the remaining arguments, all are mandatory if you're playing audio, none are needed if you're stopping audio, and only the second (behavior) is necessary if you're clearing the queue.

Because this does the same as the next three combined, we’ll just look directly at those.
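In other words, these two calls build the same directive (the stream values are hypothetical):

// Via the convenience method
this.response.audioPlayer('play', 'ENQUEUE',
  'https://example.com/episode-2.mp3', 'episode-2', 'episode-1', 0);

// Via the underlying method
this.response.audioPlayerPlay('ENQUEUE',
  'https://example.com/episode-2.mp3', 'episode-2', 'episode-1', 0);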

audioPlayerPlay

audioPlayerPlay(behavior, url, token, expectedPreviousToken, offsetInMilliseconds) {
  const audioPlayerDirective = {
    type : DIRECTIVE_TYPES.AUDIOPLAYER.PLAY,
    playBehavior: behavior,
    audioItem: {
      stream: {
        url: url,
        token: token,
        expectedPreviousToken: expectedPreviousToken,
        offsetInMilliseconds: offsetInMilliseconds
      }
    }
  };

  this._addDirective(audioPlayerDirective);
  return this;
}

The audioPlayerPlay method (or, as I like to call it, the Audio Player, Play On method) will play a stream of long-form audio. It is—as all long-form audio capabilities are—unsupported on Fire TV.

The first argument is behavior, which accepts one of three values:

  • ENQUEUE Will play the new stream after what is currently in the queue.
  • REPLACE_ALL Replaces all in the queue, including the currently playing and immediately plays the new stream.
  • REPLACE_ENQUEUED Replaces everything in the queue after the current stream that is playing. Does not stop the current stream.

The SDK will not throw an error if you include another value, but don’t do it. Seriously.

The second argument is url, which is the location of the audio to stream. This must point to an HTTPS URL, and the audio can be MP3, AAC, MP4, HLS, PLS, or M3U.

The third argument is token, which identifies the stream and must be 1,024 characters or fewer. It matters because of the next argument.

The fourth argument is expectedPreviousToken. This is, essentially, the token of the stream that should come before this one. It's used in situations where the expected behavior and the behavior triggered by the user could potentially cause trouble (for example, a user saying "previous track" right as the current track is ending). It is allowed, and required, only when the behavior is ENQUEUE; the SDK won't throw an error otherwise, but the platform will.

The last argument is offsetInMilliseconds. It's a timestamp representing where in the stream playback should start; 0, of course, starts at the beginning. A developer might use this where a user is coming back to a certain point (for an individual music track, maybe not; for a recording of a concert, yes).
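Putting those arguments together, an ENQUEUE call (hypothetical values) pushes a directive like this onto the response:

// Result of audioPlayerPlay('ENQUEUE',
//   'https://example.com/track-2.mp3', 'track-2', 'track-1', 0)
{
  type: 'AudioPlayer.Play',
  playBehavior: 'ENQUEUE',
  audioItem: {
    stream: {
      url: 'https://example.com/track-2.mp3',
      token: 'track-2',
      expectedPreviousToken: 'track-1',
      offsetInMilliseconds: 0
    }
  }
}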

audioPlayerStop

audioPlayerStop() {
  const audioPlayerDirective = {
    'type': DIRECTIVE_TYPES.AUDIOPLAYER.STOP
  };

  this._addDirective(audioPlayerDirective);
  return this;
}

Stops the stream. No arguments necessary. I don’t think we need to get too much into this.

audioPlayerClearQueue

audioPlayerClearQueue(clearBehavior) {
  const audioPlayerDirective = {
    type : DIRECTIVE_TYPES.AUDIOPLAYER.CLEAR_QUEUE,
    clearBehavior : clearBehavior
  };

  this._addDirective(audioPlayerDirective);
  return this;
}

Finally, audioPlayerClearQueue will clear the queue following a clearBehavior. The options for clearBehavior are 'CLEAR_ENQUEUED' or 'CLEAR_ALL'. The difference between the two is that 'CLEAR_ENQUEUED' will clear all after the currently playing stream and continue playback, while 'CLEAR_ALL' will also clear the current stream and stop playback.
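For instance, a minimal sketch of each behavior:

// Keep the current stream playing; clear everything queued behind it
this.response.audioPlayerClearQueue('CLEAR_ENQUEUED');

// Or stop playback entirely and empty the queue
this.response.audioPlayerClearQueue('CLEAR_ALL');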

Echo Show Capabilities

And now we get into the reason for the new SDK! The Echo Show is here and it’s fantastic. Yet, for a while, we had no way of using this SDK to build skills that fully took advantage of the Echo Show’s capabilities.

Templates are layouts for displaying information in the Echo Show. There are two parent types of templates: body templates and list templates. Each of these has sub-types.

With a body template, the user sees text and unselectable images.

With a list template, the user sees a list of text and potentially images. In this type of template, the images can be selected.

We’ll look in-depth at templates in a future post. What you need to know is that you will almost never build a template on your own. Rather, you’ll use the template builder classes that are provided in the SDK (again, coming up in a future post).

renderTemplate

renderTemplate(template) {
  const templateDirective = {
    type : DIRECTIVE_TYPES.DISPLAY.RENDER_TEMPLATE,
    template : template
  };

  this._addDirective(templateDirective);
  return this;
}

renderTemplate takes a template object, places it into another object along with the RENDER_TEMPLATE type, then passes that along to _addDirective. Remember: _addDirective builds up an array of the directives being used. A bit confused about how this might look in practice?

const builder = new Alexa.templateBuilders.BodyTemplate2Builder();
let template = builder.setTitle('title')
                      .setTextContent(Alexa.utils.TextUtils.makePlainText('Here is some text'))
                      .build();
this
  .response
  .speak(`Hello`)
  .hint('Hey there')
  .renderTemplate(template);
this.emit(':responseReady');

What I left out here is really important: you must wrap this in a conditional that checks to see if the device can render templates (if this.event.context.System.device.supportedInterfaces.Display is truthy). If you do not do this and someone interacts with your skill elsewhere, like an Echo Dot, Alexa will throw an error. There’s value, true, in forcing the developer to be explicit about what they’re doing and where. Google Assistant requires this same capability check. On the other hand, it’s a bit of an extra load and it’d be nice if Alexa could just toss away anything that wasn’t valid. Alas, that’s not how it is.
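A minimal sketch of that guard, assuming the same handler pattern as above:

// Only send the Display directive to devices that can render it
const hasDisplay = this.event.context &&
  this.event.context.System.device.supportedInterfaces.Display;

this.response.speak('Hello');
if (hasDisplay) {
  this.response.hint('Hey there').renderTemplate(template);
}
this.emit(':responseReady');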

hint

hint(hintText, hintType) {
  if(!hintType) {
    hintType = HINT_TYPES.PLAIN_TEXT;
  }

  const hintDirective = {
    type : DIRECTIVE_TYPES.HINT,
    hint : {
      type : hintType,
      text : hintText
    }
  };

  this._addDirective(hintDirective);
  return this;
}

A hint is displayed on the bottom of supported templates (all but BodyTemplate1, BodyTemplate3, and ListTemplate1) like so:

Try “${wakeWord}, ${hintText}”

You set the hintText, while the wakeWord is specific to the device and can’t be changed by you. You would use this in a situation where you wanted to encourage further interaction with your skill. It provides your users with a hint of what they can do next.

You can see in the code that there are two arguments: hintText and hintType. The only hintType right now is plain text. The fact that there's a second argument at all opens up the question of whether rich text hints will be available in the future. It's easy to imagine being able to link between templates from a hint, much like you can right now in the body text. As of today, if you try to set the value to "RichText", Alexa will throw an error.
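As a result, a call like .hint('play the next episode') adds this directive, taken straight from the method above:

{
  type: 'Hint',
  hint: {
    type: 'PlainText',
    text: 'play the next episode'
  }
}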

playVideo

playVideo(source, metadata) {
  const playVideoDirective = {
    type : DIRECTIVE_TYPES.VIDEOAPP.LAUNCH,
    videoItem : {
      source : source
    }
  };

  if (playVideoDirective.videoItem.metadata) {
    playVideoDirective.videoItem.metadata = metadata;
  }

  // Note : shouldEndSession flag is not allowed with LaunchVideoApp.Launch Directive
  delete this._responseObject.response.shouldEndSession;
  this._addDirective(playVideoDirective);
  return this;
}

The playVideo method does just what it says: plays a video. Right now it is only on the Echo Show and is not supported on the Fire TV.

This method requires a source, which must be in either HLS or H.264 format. To use this method, you must specify when setting up your skill that it plays video, and implement AMAZON.PauseIntent/AMAZON.StopIntent (these, in fact, are effectively the same here) and AMAZON.ResumeIntent.

You can optionally include metadata. This is an object that includes a title and subtitle:

{
  title: 'Dog Greeting Returning Soldier Video',
  subtitle: 'Try not to cry.'
}

Something interesting is the deletion of shouldEndSession on the response object. This makes sense, because you wouldn't want to end the session just as you're starting up a video. What stands out, though, is that including a null or false value would trigger an error on Alexa's side. Also interesting: you can't specify an offset into the video, like you can with audio. Finally, look closely at the metadata conditional above. It checks playVideoDirective.videoItem.metadata, which is never set, rather than the metadata argument, so as written the metadata never actually makes it onto the directive. That appears to be a bug in this version of the SDK.
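Usage is a one-liner. A sketch with a hypothetical URL (and remember the metadata caveat just above):

this.response.playVideo('https://example.com/soldier-dog.mp4', {
  title: 'Dog Greeting Returning Soldier Video',
  subtitle: 'Try not to cry.'
});
this.emit(':responseReady');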

_addDirective

_addDirective(directive) {
  if(!Array.isArray(this._responseObject.response.directives)) {
    this._responseObject.response.directives = [];
  }

  this._responseObject.response.directives.push(directive);
}

We’ve seen this method several times and can tell by the leading _ that it’s not meant to be used outside of this class. It does two things:

  • Checks to see if the directives property on the response object is an array (really a check for whether it’s undefined). Sets it to an empty array if it’s not.
  • Pushes the specified directive into the array.

You might be hard-pressed to think of a situation where you would need multiple directives, but in practice you’ll use more than one all the time: when you render a template and display a hint, you’re already sending two. That will be the most common situation.
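Returning to the renderTemplate-plus-hint example from earlier, the directives array ends up with both entries, in call order:

// response.directives after .hint('Hey there').renderTemplate(template)
[
  { type: 'Hint', hint: { type: 'PlainText', text: 'Hey there' } },
  { type: 'Display.RenderTemplate', template: { /* the built template */ } }
]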

In this post we looked at the new ResponseBuilder class. It added new functionality for the Echo Show, like template rendering, hints, and video playing. In the next post in the Dig Deep series, we’ll look at the template builder class. Until then…