In this post I am going to share the approach I took to synthesize an utterance from text with automatic language detection. Sample code is provided in both Objective-C and Swift 3.0 below.

In order to synthesize an utterance I had to use the AVFoundation framework, and the AVSpeechSynthesizer and AVSpeechUtterance classes in particular. AVSpeechSynthesizer is used to pronounce utterances, and AVSpeechUtterance allows you to set up various properties of an utterance, including its rate, voice, and volume. Before passing the utterance to the synthesizer I had to activate an audio session with the Playback category and the DuckOthers option (feel free to pick other categories or options depending on your application's behaviour or specific use case). Once the synthesizer finished speaking the last utterance, I had to deactivate the previously activated audio session.
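In isolation, the session activation/deactivation bracket looks like this (a minimal sketch; .mixWithOthers is mentioned only as one alternative if you prefer other audio to keep playing at full volume instead of ducking):

let session = AVAudioSession.sharedInstance()
do {
    // .duckOthers lowers other apps' audio while we speak;
    // [.mixWithOthers] would leave it playing at full volume instead
    try session.setCategory(AVAudioSessionCategoryPlayback, with: [.duckOthers])
    try session.setActive(true)
} catch {
    // activation failed, so don't attempt to speak
}

// ... speak the utterance here ...

// once speaking has finished, hand audio focus back to other apps
try? session.setActive(false)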

For automatic language detection I used NSLinguisticTagger (from the Foundation framework), which allowed me to detect the language of a given NSString (String) object. Then I had to iterate over the speech voices installed on the device (or simulator) to find a match for the text's language. If an appropriate match couldn't be found, or if the text's language couldn't be detected at all (e.g. when the text is too short, in which case the tagger returns "und" for undetermined), I had to fall back to a default language.
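To illustrate, here is roughly how the detection behaves (a minimal sketch; the sample strings are my own, and the result for short input may vary):

let tagger = NSLinguisticTagger(tagSchemes: [NSLinguisticTagSchemeLanguage], options: 0)

tagger.string = "Guten Morgen, wie geht es dir?"
// returns a language code such as "de" for German text
let detected = tagger.tag(at: 0, scheme: NSLinguisticTagSchemeLanguage, tokenRange: nil, sentenceRange: nil)

tagger.string = "ok"
// too little context to classify, so the tagger may return "und" (undetermined)
let undetermined = tagger.tag(at: 0, scheme: NSLinguisticTagSchemeLanguage, tokenRange: nil, sentenceRange: nil)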

Below is the sample code I used for the Swift version of the implementation:

import AVFoundation

@objc
class TextToSpeechUtils: NSObject, AVSpeechSynthesizerDelegate {

    let synthesizer = AVSpeechSynthesizer()
    let audioSession = AVAudioSession.sharedInstance()
    let defaultLanguage = "en-US"
    var lastPlayingUtterance: AVSpeechUtterance?

    override init() {
        super.init()
        // set the delegate, otherwise didFinish is never called
        // and the audio session would never be deactivated
        synthesizer.delegate = self
    }

    public func synthesizeSpeech(forText text: String) {

        if text.isEmpty { return }

        do {
            try audioSession.setCategory(AVAudioSessionCategoryPlayback, with: [.duckOthers])
            try audioSession.setActive(true)
        } catch {
            return
        }

        let utterance = AVSpeechUtterance(string: text)
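        // keep the default rate; volume ranges from 0.0 (silent) to 1.0 (full volume, the default)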
        utterance.rate = AVSpeechUtteranceDefaultSpeechRate
        utterance.volume = 0.7
        utterance.voice = AVSpeechSynthesisVoice(language: detectLanguageFromText(text))
        self.synthesizer.speak(utterance)

        self.lastPlayingUtterance = utterance
    }

    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        if synthesizer === self.synthesizer && self.lastPlayingUtterance === utterance {
            do {
                // after last utterance has played - deactivate the audio session
                try self.audioSession.setActive(false)
            } catch {
                return
            }
        }
    }

    private func detectLanguageFromText(_ text: String) -> String {
        let tagger = NSLinguisticTagger(tagSchemes: [NSLinguisticTagSchemeLanguage], options: 0)
        tagger.string = text
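        // the tag at index 0 is the language detected for the text ("und" if it can't be determined)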
        let textLanguage = tagger.tag(at: 0, scheme: NSLinguisticTagSchemeLanguage, tokenRange: nil, sentenceRange: nil)
        var detectedLanguage: String?
        for voice in AVSpeechSynthesisVoice.speechVoices() {
            let languageStringParts = voice.language.components(separatedBy: "-")
            if languageStringParts.count > 0 && languageStringParts[0] == textLanguage {
                detectedLanguage = voice.language
                break
            }
        }

        // if language could not be detected return default language
        return detectedLanguage ?? defaultLanguage
    }
}
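Here is a quick usage example (the view controller is hypothetical; the important part is keeping a strong reference to the utility so the delegate callback can fire before the object is deallocated):

import UIKit

class ViewController: UIViewController {

    // keep a strong reference; a local variable could be deallocated
    // before speechSynthesizer(_:didFinish:) is delivered
    let textToSpeech = TextToSpeechUtils()

    func speakGreeting() {
        // a French voice should be selected automatically for this text
        textToSpeech.synthesizeSpeech(forText: "Bonjour tout le monde!")
    }
}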

And here is the Objective-C version of the same code. Header file:

#import <Foundation/Foundation.h>
#import <AVFoundation/AVFoundation.h>

@interface TSTextToSpeechUtils : NSObject<AVSpeechSynthesizerDelegate>

- (void)synthesizeSpeechForText:(NSString *)text;

@end

Implementation file:

#import "TSTextToSpeechUtils.h"

@interface TSTextToSpeechUtils ()

@property (strong, nonatomic) AVSpeechSynthesizer *synthesizer;
@property (strong, nonatomic) AVSpeechUtterance *lastPlayingUtterance;
@property (strong, nonatomic) AVAudioSession *audioSession;

@end

@implementation TSTextToSpeechUtils

- (instancetype)init
{
    if ((self = [super init])) {
        _synthesizer = [[AVSpeechSynthesizer alloc] init];
        _synthesizer.delegate = self;
    }
    return self;
}

- (void)synthesizeSpeechForText:(NSString *)text
{
    if ([text length] == 0) {
        return;
    }

    self.audioSession = [AVAudioSession sharedInstance];

    NSError *error = nil;

    // activate the audio session to play the utterance; bail out if either call fails
    BOOL activated = [self.audioSession setCategory:AVAudioSessionCategoryPlayback
                                        withOptions:AVAudioSessionCategoryOptionDuckOthers
                                              error:&error];
    activated = activated && [self.audioSession setActive:YES error:&error];
    if (!activated) {
        return;
    }

    AVSpeechUtterance *utterance = [[AVSpeechUtterance alloc] initWithString:text];
    utterance.rate = AVSpeechUtteranceDefaultSpeechRate;
    utterance.voice = [AVSpeechSynthesisVoice voiceWithLanguage:[self detectLanguageFromText:text]];
    utterance.volume = 0.7;
    [self.synthesizer speakUtterance:utterance];

    self.lastPlayingUtterance = utterance;
}

- (void)speechSynthesizer:(AVSpeechSynthesizer *)synthesizer didFinishSpeechUtterance:(AVSpeechUtterance *)utterance
{
    if (synthesizer == self.synthesizer && self.lastPlayingUtterance == utterance) {
        NSError *error;
        // after last utterance has played - deactivate the audio session
        [self.audioSession setActive:NO error:&error];
    }
}

- (NSString *)detectLanguageFromText:(NSString *)text
{
    NSLinguisticTagger *tagger = [[NSLinguisticTagger alloc] initWithTagSchemes:@[NSLinguisticTagSchemeLanguage] options:0];
    tagger.string = text;
    NSString *textLanguage = [tagger tagAtIndex:0 scheme:NSLinguisticTagSchemeLanguage tokenRange:nil sentenceRange:nil];

    NSString *detectedLanguage = nil;

    // check if the text language matches one of the installed voices
    for (AVSpeechSynthesisVoice *voice in [AVSpeechSynthesisVoice speechVoices]) {
        NSArray *languageStringParts = [voice.language componentsSeparatedByString:@"-"];
        if (languageStringParts.count > 0 && [languageStringParts[0] isEqualToString:textLanguage]) {
            detectedLanguage = voice.language;
            break;
        }
    }

    if (detectedLanguage == nil) {
        // if language could not be detected assign to default
        detectedLanguage = @"en-US";
    }
    return detectedLanguage;
}

@end