src/remux/mp4-remuxer.ts

import AAC from './aac-helper';
import MP4 from './mp4-generator';
import type { HlsEventEmitter } from '../events';
import { Events } from '../events';
import { ErrorTypes, ErrorDetails } from '../errors';
import { logger } from '../utils/logger';
import {
  InitSegmentData,
  Remuxer,
  RemuxerResult,
  RemuxedMetadata,
  RemuxedTrack,
  RemuxedUserdata,
} from '../types/remuxer';
import { PlaylistLevelType } from '../types/loader';
import { toMsFromMpegTsClock } from '../utils/timescale-conversion';
import type {
  AudioSample,
  AvcSample,
  DemuxedAudioTrack,
  DemuxedAvcTrack,
  DemuxedMetadataTrack,
  DemuxedUserdataTrack,
} from '../types/demuxer';
import type { TrackSet } from '../types/track';
import type { SourceBufferName } from '../types/buffer';
import type { Fragment } from '../loader/fragment';
import type { HlsConfig } from '../config';

const MAX_SILENT_FRAME_DURATION = 10 * 1000; // 10 seconds
const AAC_SAMPLES_PER_FRAME = 1024;
const MPEG_AUDIO_SAMPLE_PER_FRAME = 1152;

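// A rough worked example (illustrative sample rate, not from any stream): at
// 44100 Hz, one AAC frame lasts 1024 / 44100 ≈ 23.22 ms and one MPEG audio
// frame lasts 1152 / 44100 ≈ 26.12 ms, so the 10 000 ms
// MAX_SILENT_FRAME_DURATION cap corresponds to roughly 430 injected AAC frames.
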
let chromeVersion: number | null = null;
let safariWebkitVersion: number | null = null;

export default class MP4Remuxer implements Remuxer {
  private observer: HlsEventEmitter;
  private config: HlsConfig;
  private typeSupported: any;
  private ISGenerated: boolean = false;
  private _initPTS!: number;
  private _initDTS!: number;
  private nextAvcDts: number | null = null;
  private nextAudioPts: number | null = null;
  private videoSampleDuration: number | null = null;
  private isAudioContiguous: boolean = false;
  private isVideoContiguous: boolean = false;

  constructor(
    observer: HlsEventEmitter,
    config: HlsConfig,
    typeSupported,
    vendor = ''
  ) {
    this.observer = observer;
    this.config = config;
    this.typeSupported = typeSupported;
    this.ISGenerated = false;

    if (chromeVersion === null) {
      const userAgent = navigator.userAgent || '';
      const result = userAgent.match(/Chrome\/(\d+)/i);
      chromeVersion = result ? parseInt(result[1]) : 0;
    }
    if (safariWebkitVersion === null) {
      const result = navigator.userAgent.match(/Safari\/(\d+)/i);
      safariWebkitVersion = result ? parseInt(result[1]) : 0;
    }
  }

  destroy() {}

  resetTimeStamp(defaultTimeStamp) {
    logger.log('[mp4-remuxer]: initPTS & initDTS reset');
    this._initPTS = this._initDTS = defaultTimeStamp;
  }

  resetNextTimestamp() {
    logger.log('[mp4-remuxer]: reset next timestamp');
    this.isVideoContiguous = false;
    this.isAudioContiguous = false;
  }

  resetInitSegment() {
    logger.log('[mp4-remuxer]: ISGenerated flag reset');
    this.ISGenerated = false;
  }

  getVideoStartPts(videoSamples) {
    let rolloverDetected = false;
    const startPTS = videoSamples.reduce((minPTS, sample) => {
      const delta = sample.pts - minPTS;
      if (delta < -4294967296) {
        // 2^32, see normalizePts for reasoning; we're hitting a rollover here, and we don't want that to impact the timeOffset calculation
        rolloverDetected = true;
        return normalizePts(minPTS, sample.pts);
      } else if (delta > 0) {
        return minPTS;
      } else {
        return sample.pts;
      }
    }, videoSamples[0].pts);
    if (rolloverDetected) {
      logger.debug('PTS rollover detected');
    }
    return startPTS;
  }

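  // A rough worked example (illustrative 90 kHz values, not from a real
  // stream): for sample pts [8589930000, 2048, 5648] spanning a 33-bit
  // wraparound, the second step sees delta = 2048 - 8589930000 < -2^32, flags
  // the rollover, and rebases the running minimum via
  // normalizePts(8589930000, 2048) === -4592, instead of keeping a start PTS
  // ~26.5 hours ahead of the post-wrap samples.
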
  remux(
    audioTrack: DemuxedAudioTrack,
    videoTrack: DemuxedAvcTrack,
    id3Track: DemuxedMetadataTrack,
    textTrack: DemuxedUserdataTrack,
    timeOffset: number,
    accurateTimeOffset: boolean,
    flush: boolean,
    playlistType: PlaylistLevelType
  ): RemuxerResult {
    let video: RemuxedTrack | undefined;
    let audio: RemuxedTrack | undefined;
    let initSegment: InitSegmentData | undefined;
    let text: RemuxedUserdata | undefined;
    let id3: RemuxedMetadata | undefined;
    let independent: boolean | undefined;
    let audioTimeOffset = timeOffset;
    let videoTimeOffset = timeOffset;

    // If we're remuxing audio and video progressively, wait until we've received enough samples for each track before proceeding.
    // This is done to synchronize the audio and video streams. We know if the current segment will have samples if the "pid"
    // parameter is greater than -1. The pid is set when the PMT is parsed, which contains the tracks list.
    // However, if the initSegment has already been generated, or we've reached the end of a segment (flush),
    // then we can remux one track without waiting for the other.
    const hasAudio = audioTrack.pid > -1;
    const hasVideo = videoTrack.pid > -1;
    const length = videoTrack.samples.length;
    const enoughAudioSamples = audioTrack.samples.length > 0;
    const enoughVideoSamples = (flush && length > 0) || length > 1;
    const canRemuxAvc =
      ((!hasAudio || enoughAudioSamples) &&
        (!hasVideo || enoughVideoSamples)) ||
      this.ISGenerated ||
      flush;

    if (canRemuxAvc) {
      if (!this.ISGenerated) {
        initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
      }

      const isVideoContiguous = this.isVideoContiguous;
      let firstKeyFrameIndex = -1;
      let firstKeyFramePTS;

      if (enoughVideoSamples) {
        firstKeyFrameIndex = findKeyframeIndex(videoTrack.samples);
        if (!isVideoContiguous && this.config.forceKeyFrameOnDiscontinuity) {
          independent = true;
          if (firstKeyFrameIndex > 0) {
            logger.warn(
              `[mp4-remuxer]: Dropped ${firstKeyFrameIndex} out of ${length} video samples due to a missing keyframe`
            );
            const startPTS = this.getVideoStartPts(videoTrack.samples);
            videoTrack.samples = videoTrack.samples.slice(firstKeyFrameIndex);
            videoTrack.dropped += firstKeyFrameIndex;
            videoTimeOffset +=
              (videoTrack.samples[0].pts - startPTS) /
              videoTrack.inputTimeScale;
            firstKeyFramePTS = videoTimeOffset;
          } else if (firstKeyFrameIndex === -1) {
            logger.warn(
              `[mp4-remuxer]: No keyframe found out of ${length} video samples`
            );
            independent = false;
          }
        }
      }

      if (this.ISGenerated) {
        if (enoughAudioSamples && enoughVideoSamples) {
          // timeOffset is expected to be the offset of the first timestamp of this fragment (first DTS)
          // if the first audio DTS is not aligned with the first video DTS then we need to take that into account
          // when providing timeOffset to remuxAudio / remuxVideo. if we don't, there might be a permanent / small
          // drift between the audio and video streams
          const startPTS = this.getVideoStartPts(videoTrack.samples);
          const tsDelta =
            normalizePts(audioTrack.samples[0].pts, startPTS) - startPTS;
          const audiovideoTimestampDelta = tsDelta / videoTrack.inputTimeScale;
          audioTimeOffset += Math.max(0, audiovideoTimestampDelta);
          videoTimeOffset += Math.max(0, -audiovideoTimestampDelta);
        }

        // Purposefully remuxing audio before video, so that remuxVideo can use nextAudioPts, which is calculated in remuxAudio.
        if (enoughAudioSamples) {
          // if initSegment was generated without audio samples, regenerate it
          if (!audioTrack.samplerate) {
            logger.warn(
              '[mp4-remuxer]: regenerate InitSegment as audio detected'
            );
            initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
          }
          audio = this.remuxAudio(
            audioTrack,
            audioTimeOffset,
            this.isAudioContiguous,
            accurateTimeOffset,
            hasVideo ||
              enoughVideoSamples ||
              playlistType === PlaylistLevelType.AUDIO
              ? videoTimeOffset
              : undefined
          );
          if (enoughVideoSamples) {
            const audioTrackLength = audio ? audio.endPTS - audio.startPTS : 0;
            // if initSegment was generated without video samples, regenerate it
            if (!videoTrack.inputTimeScale) {
              logger.warn(
                '[mp4-remuxer]: regenerate InitSegment as video detected'
              );
              initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
            }
            video = this.remuxVideo(
              videoTrack,
              videoTimeOffset,
              isVideoContiguous,
              audioTrackLength
            );
          }
        } else if (enoughVideoSamples) {
          video = this.remuxVideo(
            videoTrack,
            videoTimeOffset,
            isVideoContiguous,
            0
          );
        }
        if (video) {
          video.firstKeyFrame = firstKeyFrameIndex;
          video.independent = firstKeyFrameIndex !== -1;
          video.firstKeyFramePTS = firstKeyFramePTS;
        }
      }
    }

    // Allow ID3 and text to remux, even if more audio/video samples are required
    if (this.ISGenerated) {
      if (id3Track.samples.length) {
        id3 = flushTextTrackMetadataCueSamples(
          id3Track,
          timeOffset,
          this._initPTS,
          this._initDTS
        );
      }

      if (textTrack.samples.length) {
        text = flushTextTrackUserdataCueSamples(
          textTrack,
          timeOffset,
          this._initPTS
        );
      }
    }

    return {
      audio,
      video,
      initSegment,
      independent,
      text,
      id3,
    };
  }

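  // A minimal sketch of the audio/video offset alignment in remux() above
  // (hypothetical values, inputTimeScale = 90000): if the first video PTS is
  // 900000 and the first audio PTS is 909000, tsDelta = 9000 and
  // audiovideoTimestampDelta = 0.1 s, so audioTimeOffset advances by 0.1 s
  // while videoTimeOffset is unchanged; a negative delta shifts
  // videoTimeOffset instead, keeping both tracks anchored to the earlier
  // stream.
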
  generateIS(
    audioTrack: DemuxedAudioTrack,
    videoTrack: DemuxedAvcTrack,
    timeOffset
  ): InitSegmentData | undefined {
    const audioSamples = audioTrack.samples;
    const videoSamples = videoTrack.samples;
    const typeSupported = this.typeSupported;
    const tracks: TrackSet = {};
    const computePTSDTS = !Number.isFinite(this._initPTS);
    let container = 'audio/mp4';
    let initPTS: number | undefined;
    let initDTS: number | undefined;
    let timescale: number | undefined;

    if (computePTSDTS) {
      initPTS = initDTS = Infinity;
    }

    if (audioTrack.config && audioSamples.length) {
      // let's use the audio sampling rate as the MP4 time scale.
      // rationale is that there is an integer number of audio samples per audio frame (1024 for AAC)
      // using the audio sampling rate here helps having an integer MP4 frame duration
      // and avoids potential rounding and AV sync issues
      audioTrack.timescale = audioTrack.samplerate;
      switch (audioTrack.segmentCodec) {
        case 'mp3':
          if (typeSupported.mpeg) {
            // Chrome and Safari
            container = 'audio/mpeg';
            audioTrack.codec = '';
          } else if (typeSupported.mp3) {
            // Firefox
            audioTrack.codec = 'mp3';
          }
          break;
      }
      tracks.audio = {
        id: 'audio',
        container: container,
        codec: audioTrack.codec,
        initSegment:
          audioTrack.segmentCodec === 'mp3' && typeSupported.mpeg
            ? new Uint8Array(0)
            : MP4.initSegment([audioTrack]),
        metadata: {
          channelCount: audioTrack.channelCount,
        },
      };
      if (computePTSDTS) {
        timescale = audioTrack.inputTimeScale;
        // remember first PTS of this demuxing context. for audio, PTS = DTS
        initPTS = initDTS =
          audioSamples[0].pts - Math.round(timescale * timeOffset);
      }
    }

    if (videoTrack.sps && videoTrack.pps && videoSamples.length) {
      // let's use the input time scale as the MP4 video timescale
      // we use the input time scale straight away to avoid rounding issues on frame duration / cts computation
      videoTrack.timescale = videoTrack.inputTimeScale;
      tracks.video = {
        id: 'main',
        container: 'video/mp4',
        codec: videoTrack.codec,
        initSegment: MP4.initSegment([videoTrack]),
        metadata: {
          width: videoTrack.width,
          height: videoTrack.height,
        },
      };
      if (computePTSDTS) {
        timescale = videoTrack.inputTimeScale;
        const startPTS = this.getVideoStartPts(videoSamples);
        const startOffset = Math.round(timescale * timeOffset);
        initDTS = Math.min(
          initDTS as number,
          normalizePts(videoSamples[0].dts, startPTS) - startOffset
        );
        initPTS = Math.min(initPTS as number, startPTS - startOffset);
      }
    }

    if (Object.keys(tracks).length) {
      this.ISGenerated = true;
      if (computePTSDTS) {
        this._initPTS = initPTS as number;
        this._initDTS = initDTS as number;
      }

      return {
        tracks,
        initPTS,
        timescale,
      };
    }
  }

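  // A rough worked example for the audio branch of generateIS() above
  // (hypothetical values): with inputTimeScale = 90000, timeOffset = 10 s and
  // a first audio sample pts of 912000, initPTS = initDTS =
  // 912000 - Math.round(90000 * 10) = 12000 (~0.133 s), and every subsequent
  // sample is rebased by subtracting that value.
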
  remuxVideo(
    track: DemuxedAvcTrack,
    timeOffset: number,
    contiguous: boolean,
    audioTrackLength: number
  ): RemuxedTrack | undefined {
    const timeScale: number = track.inputTimeScale;
    const inputSamples: Array<AvcSample> = track.samples;
    const outputSamples: Array<Mp4Sample> = [];
    const nbSamples: number = inputSamples.length;
    const initPTS: number = this._initPTS;
    let nextAvcDts = this.nextAvcDts;
    let offset = 8;
    let mp4SampleDuration = this.videoSampleDuration;
    let firstDTS;
    let lastDTS;
    let minPTS: number = Number.POSITIVE_INFINITY;
    let maxPTS: number = Number.NEGATIVE_INFINITY;
    let sortSamples = false;

    // if the parsed fragment is contiguous with the last one, use the last DTS value as reference
    if (!contiguous || nextAvcDts === null) {
      const pts = timeOffset * timeScale;
      const cts =
        inputSamples[0].pts -
        normalizePts(inputSamples[0].dts, inputSamples[0].pts);
      // if not contiguous, let's use the target timeOffset
      nextAvcDts = pts - cts;
    }

    // PTS is coded on 33 bits, and can loop from -2^32 to 2^32
    // normalizePts will make PTS/DTS values monotonic; we use the last known DTS value as the reference
    for (let i = 0; i < nbSamples; i++) {
      const sample = inputSamples[i];
      sample.pts = normalizePts(sample.pts - initPTS, nextAvcDts);
      sample.dts = normalizePts(sample.dts - initPTS, nextAvcDts);
      if (sample.dts < inputSamples[i > 0 ? i - 1 : i].dts) {
        sortSamples = true;
      }
    }

    // sort video samples by DTS then PTS then demux id order
    if (sortSamples) {
      inputSamples.sort(function (a, b) {
        const deltadts = a.dts - b.dts;
        const deltapts = a.pts - b.pts;
        return deltadts || deltapts;
      });
    }

    // Get first/last DTS
    firstDTS = inputSamples[0].dts;
    lastDTS = inputSamples[inputSamples.length - 1].dts;

    // Sample duration (as expected by trun MP4 boxes) should be the delta between sample DTS;
    // set this constant duration as being the average delta between consecutive DTS.
    const inputDuration = lastDTS - firstDTS;
    const averageSampleDuration = inputDuration
      ? Math.round(inputDuration / (nbSamples - 1))
      : mp4SampleDuration || track.inputTimeScale / 30;

    // if fragments are contiguous, detect holes/overlaps between fragments
    if (contiguous) {
      // check timestamp continuity across consecutive fragments (this is to remove inter-fragment gaps/holes)
      const delta = firstDTS - nextAvcDts;
      const foundHole = delta > averageSampleDuration;
      const foundOverlap = delta < -1;
      if (foundHole || foundOverlap) {
        if (foundHole) {
          logger.warn(
            `AVC: ${toMsFromMpegTsClock(
              delta,
              true
            )} ms (${delta}dts) hole between fragments detected, filling it`
          );
        } else {
          logger.warn(
            `AVC: ${toMsFromMpegTsClock(
              -delta,
              true
            )} ms (${delta}dts) overlapping between fragments detected`
          );
        }
        if (!foundOverlap || nextAvcDts > inputSamples[0].pts) {
          firstDTS = nextAvcDts;
          const firstPTS = inputSamples[0].pts - delta;
          inputSamples[0].dts = firstDTS;
          inputSamples[0].pts = firstPTS;
          logger.log(
            `Video: First PTS/DTS adjusted: ${toMsFromMpegTsClock(
              firstPTS,
              true
            )}/${toMsFromMpegTsClock(
              firstDTS,
              true
            )}, delta: ${toMsFromMpegTsClock(delta, true)} ms`
          );
        }
      }
    }

    firstDTS = Math.max(0, firstDTS);

    let nbNalu = 0;
    let naluLen = 0;
    for (let i = 0; i < nbSamples; i++) {
      // compute total/avc sample length and nb of NAL units
      const sample = inputSamples[i];
      const units = sample.units;
      const nbUnits = units.length;
      let sampleLen = 0;
      for (let j = 0; j < nbUnits; j++) {
        sampleLen += units[j].data.length;
      }

      naluLen += sampleLen;
      nbNalu += nbUnits;
      sample.length = sampleLen;

      // ensure sample monotonic DTS
      sample.dts = Math.max(sample.dts, firstDTS);

      minPTS = Math.min(sample.pts, minPTS);
      maxPTS = Math.max(sample.pts, maxPTS);
    }
    lastDTS = inputSamples[nbSamples - 1].dts;

    /* concatenate the video data and construct the mdat in place
      (need 8 more bytes to fill length and mdat type) */
    const mdatSize = naluLen + 4 * nbNalu + 8;
    let mdat;
    try {
      mdat = new Uint8Array(mdatSize);
    } catch (err) {
      this.observer.emit(Events.ERROR, Events.ERROR, {
        type: ErrorTypes.MUX_ERROR,
        details: ErrorDetails.REMUX_ALLOC_ERROR,
        fatal: false,
        bytes: mdatSize,
        reason: `fail allocating video mdat ${mdatSize}`,
      });
      return;
    }
    const view = new DataView(mdat.buffer);
    view.setUint32(0, mdatSize);
    mdat.set(MP4.types.mdat, 4);

    let stretchedLastFrame = false;
    let minDtsDelta = Number.POSITIVE_INFINITY;
    let minPtsDelta = Number.POSITIVE_INFINITY;
    let maxDtsDelta = Number.NEGATIVE_INFINITY;
    let maxPtsDelta = Number.NEGATIVE_INFINITY;
    for (let i = 0; i < nbSamples; i++) {
      const avcSample = inputSamples[i];
      const avcSampleUnits = avcSample.units;
      let mp4SampleLength = 0;
      // convert NALU bitstream to MP4 format (prepend NALU with size field)
      for (let j = 0, nbUnits = avcSampleUnits.length; j < nbUnits; j++) {
        const unit = avcSampleUnits[j];
        const unitData = unit.data;
        const unitDataLen = unit.data.byteLength;
        view.setUint32(offset, unitDataLen);
        offset += 4;
        mdat.set(unitData, offset);
        offset += unitDataLen;
        mp4SampleLength += 4 + unitDataLen;
      }

      // expected sample duration is the Decoding Timestamp diff of consecutive samples
      let ptsDelta;
      if (i < nbSamples - 1) {
        mp4SampleDuration = inputSamples[i + 1].dts - avcSample.dts;
        ptsDelta = inputSamples[i + 1].pts - avcSample.pts;
      } else {
        const config = this.config;
        const lastFrameDuration =
          i > 0
            ? avcSample.dts - inputSamples[i - 1].dts
            : averageSampleDuration;
        ptsDelta =
          i > 0
            ? avcSample.pts - inputSamples[i - 1].pts
            : averageSampleDuration;
        if (config.stretchShortVideoTrack && this.nextAudioPts !== null) {
          // In some cases, a segment's audio track duration may exceed the video track duration.
          // Since we've already remuxed audio, and we know how long the audio track is, we look to
          // see if the delta to the next segment is longer than maxBufferHole.
          // If so, playback would potentially get stuck, so we artificially inflate
          // the duration of the last frame to minimize any potential gap between segments.
          const gapTolerance = Math.floor(config.maxBufferHole * timeScale);
          const deltaToFrameEnd =
            (audioTrackLength
              ? minPTS + audioTrackLength * timeScale
              : this.nextAudioPts) - avcSample.pts;
          if (deltaToFrameEnd > gapTolerance) {
            // We subtract lastFrameDuration from deltaToFrameEnd to try to prevent any video
            // frame overlap. maxBufferHole should be >> lastFrameDuration anyway.
            mp4SampleDuration = deltaToFrameEnd - lastFrameDuration;
            if (mp4SampleDuration < 0) {
              mp4SampleDuration = lastFrameDuration;
            } else {
              stretchedLastFrame = true;
            }
            logger.log(
              `[mp4-remuxer]: It is approximately ${
                deltaToFrameEnd / 90
              } ms to the next segment; using duration ${
                mp4SampleDuration / 90
              } ms for the last video frame.`
            );
          } else {
            mp4SampleDuration = lastFrameDuration;
          }
        } else {
          mp4SampleDuration = lastFrameDuration;
        }
      }
      const compositionTimeOffset = Math.round(avcSample.pts - avcSample.dts);
      minDtsDelta = Math.min(minDtsDelta, mp4SampleDuration);
      maxDtsDelta = Math.max(maxDtsDelta, mp4SampleDuration);
      minPtsDelta = Math.min(minPtsDelta, ptsDelta);
      maxPtsDelta = Math.max(maxPtsDelta, ptsDelta);

      outputSamples.push(
        new Mp4Sample(
          avcSample.key,
          mp4SampleDuration,
          mp4SampleLength,
          compositionTimeOffset
        )
      );
    }

    if (outputSamples.length) {
      if (chromeVersion) {
        if (chromeVersion < 70) {
          // Chrome workaround: mark the first sample as a Random Access Point (keyframe) to avoid a sourcebuffer append issue
          // https://code.google.com/p/chromium/issues/detail?id=229412
          const flags = outputSamples[0].flags;
          flags.dependsOn = 2;
          flags.isNonSync = 0;
        }
      } else if (safariWebkitVersion) {
        // Fix for "CNN special report, with CC" in test-streams (Safari browser only)
        // Ignore DTS when frame durations are irregular. Safari MSE does not handle this, leading to gaps.
        if (
          maxPtsDelta - minPtsDelta < maxDtsDelta - minDtsDelta &&
          averageSampleDuration / maxDtsDelta < 0.025 &&
          outputSamples[0].cts === 0
        ) {
          logger.warn(
            'Found irregular gaps in sample duration. Using PTS instead of DTS to determine MP4 sample duration.'
          );
          let dts = firstDTS;
          for (let i = 0, len = outputSamples.length; i < len; i++) {
            const nextDts = dts + outputSamples[i].duration;
            const pts = dts + outputSamples[i].cts;
            if (i < len - 1) {
              const nextPts = nextDts + outputSamples[i + 1].cts;
              outputSamples[i].duration = nextPts - pts;
            } else {
              outputSamples[i].duration = i
                ? outputSamples[i - 1].duration
                : averageSampleDuration;
            }
            outputSamples[i].cts = 0;
            dts = nextDts;
          }
        }
      }
    }

    console.assert(
      mp4SampleDuration !== null,
      'mp4SampleDuration must be computed'
    );
    // next AVC sample DTS should be equal to last sample DTS + last sample duration (in PES timescale)
    mp4SampleDuration =
      stretchedLastFrame || !mp4SampleDuration
        ? averageSampleDuration
        : mp4SampleDuration;
    this.nextAvcDts = nextAvcDts = lastDTS + mp4SampleDuration;
    this.videoSampleDuration = mp4SampleDuration;
    this.isVideoContiguous = true;
    const moof = MP4.moof(
      track.sequenceNumber++,
      firstDTS,
      Object.assign({}, track, {
        samples: outputSamples,
      })
    );
    const type: SourceBufferName = 'video';
    const data = {
      data1: moof,
      data2: mdat,
      startPTS: minPTS / timeScale,
      endPTS: (maxPTS + mp4SampleDuration) / timeScale,
      startDTS: firstDTS / timeScale,
      endDTS: (nextAvcDts as number) / timeScale,
      type,
      hasAudio: false,
      hasVideo: true,
      nb: outputSamples.length,
      dropped: track.dropped,
    };

    track.samples = [];
    track.dropped = 0;

    console.assert(mdat.length, 'MDAT length must not be zero');

    return data;
  }

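  // A rough sizing example for the video mdat built in remuxVideo() above
  // (hypothetical counts): naluLen = 1_000_000 bytes spread over nbNalu = 600
  // NAL units yields mdatSize = 1_000_000 + 4 * 600 + 8 = 1_002_408 bytes:
  // a 4-byte AVCC length field prepended to each NAL unit, plus the 8-byte
  // mdat box header (32-bit size + 'mdat' type) written at the front.
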
  remuxAudio(
    track: DemuxedAudioTrack,
    timeOffset: number,
    contiguous: boolean,
    accurateTimeOffset: boolean,
    videoTimeOffset?: number
  ): RemuxedTrack | undefined {
    const inputTimeScale: number = track.inputTimeScale;
    const mp4timeScale: number = track.samplerate
      ? track.samplerate
      : inputTimeScale;
    const scaleFactor: number = inputTimeScale / mp4timeScale;
    const mp4SampleDuration: number =
      track.segmentCodec === 'aac'
        ? AAC_SAMPLES_PER_FRAME
        : MPEG_AUDIO_SAMPLE_PER_FRAME;
    const inputSampleDuration: number = mp4SampleDuration * scaleFactor;
    const initPTS: number = this._initPTS;
    const rawMPEG: boolean =
      track.segmentCodec === 'mp3' && this.typeSupported.mpeg;
    const outputSamples: Array<Mp4Sample> = [];
    const alignedWithVideo = videoTimeOffset !== undefined;

    let inputSamples: Array<AudioSample> = track.samples;
    let offset: number = rawMPEG ? 0 : 8;
    let nextAudioPts: number = this.nextAudioPts || -1;

    // window.audioSamples ? window.audioSamples.push(inputSamples.map(s => s.pts)) : (window.audioSamples = [inputSamples.map(s => s.pts)]);

    // for audio samples, also consider consecutive fragments as being contiguous (even if a level switch occurs),
    // for sake of clarity:
    // consecutive fragments are frags with
    // - less than a 100ms gap between the new time offset (if accurate) and the next expected PTS, OR
    // - less than 20 audio frames distance
    // contiguous fragments are consecutive fragments from the same quality level (same level, new SN = old SN + 1)
    // this helps ensure audio continuity
    // and also avoids audio glitches/cuts when switching quality, or reporting a wrong duration on the first audio frame
    const timeOffsetMpegTS = timeOffset * inputTimeScale;
    this.isAudioContiguous = contiguous =
      contiguous ||
      ((inputSamples.length &&
        nextAudioPts > 0 &&
        ((accurateTimeOffset &&
          Math.abs(timeOffsetMpegTS - nextAudioPts) < 9000) ||
          Math.abs(
            normalizePts(inputSamples[0].pts - initPTS, timeOffsetMpegTS) -
              nextAudioPts
          ) <
            20 * inputSampleDuration)) as boolean);

    // compute normalized PTS
    inputSamples.forEach(function (sample) {
      sample.pts = normalizePts(sample.pts - initPTS, timeOffsetMpegTS);
    });

    if (!contiguous || nextAudioPts < 0) {
      // filter out samples with negative PTS that are not playable anyway
      // if we don't remove these negative samples, they will shift all audio samples forward,
      // leading to audio overlap between the current / next fragment
      inputSamples = inputSamples.filter((sample) => sample.pts >= 0);

      // in case all samples have negative PTS, and have been filtered out, return now
      if (!inputSamples.length) {
        return;
      }

      if (videoTimeOffset === 0) {
        // Set the start to 0 to match video so that start gaps larger than inputSampleDuration are filled with silence
        nextAudioPts = 0;
      } else if (accurateTimeOffset && !alignedWithVideo) {
        // When not seeking, not live, and LevelDetails.PTSKnown, use fragment start as predicted next audio PTS
        nextAudioPts = Math.max(0, timeOffsetMpegTS);
      } else {
        // if frags are not contiguous and we can't trust the time offset, use the first sample PTS as next audio PTS
        nextAudioPts = inputSamples[0].pts;
      }
    }

    // If the audio track is missing samples, the frames seem to get "left-shifted" within the
    // resulting mp4 segment, causing sync issues and leaving gaps at the end of the audio segment.
    // In an effort to prevent this from happening, we inject frames here where there are gaps.
    // When possible, we inject a silent frame; when that's not possible, we duplicate the last
    // frame.

    if (track.segmentCodec === 'aac') {
      const maxAudioFramesDrift = this.config.maxAudioFramesDrift;
      for (let i = 0, nextPts = nextAudioPts; i < inputSamples.length; i++) {
        // First, let's see how far off this frame is from where we expect it to be
        const sample = inputSamples[i];
        const pts = sample.pts;
        const delta = pts - nextPts;
        const duration = Math.abs((1000 * delta) / inputTimeScale);

        // When remuxing with video, if we're overlapping by more than a duration, drop this sample to stay in sync
        if (
          delta <= -maxAudioFramesDrift * inputSampleDuration &&
          alignedWithVideo
        ) {
          if (i === 0) {
            logger.warn(
              `Audio frame @ ${(pts / inputTimeScale).toFixed(
                3
              )}s overlaps nextAudioPts by ${Math.round(
                (1000 * delta) / inputTimeScale
              )} ms.`
            );
            this.nextAudioPts = nextAudioPts = nextPts = pts;
          }
        } // eslint-disable-line brace-style

        // Insert missing frames if:
        // 1: We're more than maxAudioFramesDrift frames away
        // 2: Not more than MAX_SILENT_FRAME_DURATION away
        // 3: currentTime (aka nextPtsNorm) is not 0
        // 4: remuxing with video (videoTimeOffset !== undefined)
        else if (
          delta >= maxAudioFramesDrift * inputSampleDuration &&
          duration < MAX_SILENT_FRAME_DURATION &&
          alignedWithVideo
        ) {
          let missing = Math.round(delta / inputSampleDuration);
          // Adjust nextPts so that silent samples are aligned with media pts. This will prevent media samples from
          // later being shifted if nextPts is based on timeOffset and delta is not a multiple of inputSampleDuration.
          nextPts = pts - missing * inputSampleDuration;
          if (nextPts < 0) {
            missing--;
            nextPts += inputSampleDuration;
          }
          if (i === 0) {
            this.nextAudioPts = nextAudioPts = nextPts;
          }
          logger.warn(
            `[mp4-remuxer]: Injecting ${missing} audio frame @ ${(
              nextPts / inputTimeScale
            ).toFixed(3)}s due to ${Math.round(
              (1000 * delta) / inputTimeScale
            )} ms gap.`
          );
          for (let j = 0; j < missing; j++) {
            const newStamp = Math.max(nextPts as number, 0);
            let fillFrame = AAC.getSilentFrame(
              track.manifestCodec || track.codec,
              track.channelCount
            );
            if (!fillFrame) {
              logger.log(
                '[mp4-remuxer]: Unable to get silent frame for given audio codec; duplicating last frame instead.'
              );
              fillFrame = sample.unit.subarray();
            }
            inputSamples.splice(i, 0, {
              unit: fillFrame,
              pts: newStamp,
            });
            nextPts += inputSampleDuration;
            i++;
          }
        }
        sample.pts = nextPts;
        nextPts += inputSampleDuration;
      }
    }
    let firstPTS: number | null = null;
    let lastPTS: number | null = null;
    let mdat: any;
    let mdatSize: number = 0;
    let sampleLength: number = inputSamples.length;
    while (sampleLength--) {
      mdatSize += inputSamples[sampleLength].unit.byteLength;
    }
    for (let j = 0, nbSamples = inputSamples.length; j < nbSamples; j++) {
      const audioSample = inputSamples[j];
      const unit = audioSample.unit;
      let pts = audioSample.pts;
      if (lastPTS !== null) {
        // If we have more than one sample, set the duration of the sample to the "real" duration; the PTS diff with
        // the previous sample
        const prevSample = outputSamples[j - 1];
        prevSample.duration = Math.round((pts - lastPTS) / scaleFactor);
      } else {
        if (contiguous && track.segmentCodec === 'aac') {
          // set PTS/DTS to expected PTS/DTS
          pts = nextAudioPts;
        }
        // remember first PTS of our audioSamples
        firstPTS = pts;
        if (mdatSize > 0) {
          /* concatenate the audio data and construct the mdat in place
            (need 8 more bytes to fill length and mdat type) */
          mdatSize += offset;
          try {
            mdat = new Uint8Array(mdatSize);
          } catch (err) {
            this.observer.emit(Events.ERROR, Events.ERROR, {
              type: ErrorTypes.MUX_ERROR,
              details: ErrorDetails.REMUX_ALLOC_ERROR,
              fatal: false,
              bytes: mdatSize,
              reason: `fail allocating audio mdat ${mdatSize}`,
            });
            return;
          }
          if (!rawMPEG) {
            const view = new DataView(mdat.buffer);
            view.setUint32(0, mdatSize);
            mdat.set(MP4.types.mdat, 4);
          }
        } else {
          // no audio samples
          return;
        }
      }
      mdat.set(unit, offset);
      const unitLen = unit.byteLength;
      offset += unitLen;
      // Default the sample's duration to the computed mp4SampleDuration, which will either be 1024 for AAC or 1152 for MPEG
      // In the case that we have 1 sample, this will be the duration. If we have more than one sample, the duration
      // becomes the PTS diff with the previous sample
      outputSamples.push(new Mp4Sample(true, mp4SampleDuration, unitLen, 0));
      lastPTS = pts;
    }

    // We could end up with no audio samples if all input samples were overlapping with the previously remuxed ones
    const nbSamples = outputSamples.length;
    if (!nbSamples) {
      return;
    }

    // The next audio sample PTS should be equal to last sample PTS + duration
    const lastSample = outputSamples[outputSamples.length - 1];
    this.nextAudioPts = nextAudioPts =
      lastPTS! + scaleFactor * lastSample.duration;

    // Set the track samples from inputSamples to outputSamples before remuxing
    const moof = rawMPEG
      ? new Uint8Array(0)
      : MP4.moof(
          track.sequenceNumber++,
          firstPTS! / scaleFactor,
          Object.assign({}, track, { samples: outputSamples })
        );

    // Clear the track samples. This also clears the samples array in the demuxer, since the reference is shared
    track.samples = [];
    const start = firstPTS! / inputTimeScale;
    const end = nextAudioPts / inputTimeScale;
    const type: SourceBufferName = 'audio';
    const audioData = {
      data1: moof,
      data2: mdat,
      startPTS: start,
      endPTS: end,
      startDTS: start,
      endDTS: end,
      type,
      hasAudio: true,
      hasVideo: false,
      nb: nbSamples,
    };

    this.isAudioContiguous = true;

    console.assert(mdat.length, 'MDAT length must not be zero');
    return audioData;
  }

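  // A rough worked example of the AAC gap fill in remuxAudio() above
  // (hypothetical values): with inputTimeScale = 90000 and 48 kHz audio,
  // inputSampleDuration = 1024 * (90000 / 48000) = 1920 ticks; a sample
  // arriving delta = 9600 ticks after nextPts yields
  // missing = Math.round(9600 / 1920) = 5 injected silent frames, and nextPts
  // is pulled back to pts - 5 * 1920 so real samples stay aligned to the
  // media timeline.
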
  remuxEmptyAudio(
    track: DemuxedAudioTrack,
    timeOffset: number,
    contiguous: boolean,
    videoData: Fragment
  ): RemuxedTrack | undefined {
    const inputTimeScale: number = track.inputTimeScale;
    const mp4timeScale: number = track.samplerate
      ? track.samplerate
      : inputTimeScale;
    const scaleFactor: number = inputTimeScale / mp4timeScale;
    const nextAudioPts: number | null = this.nextAudioPts;
    // sync with video's timestamp
    const startDTS: number =
      (nextAudioPts !== null
        ? nextAudioPts
        : videoData.startDTS * inputTimeScale) + this._initDTS;
    const endDTS: number = videoData.endDTS * inputTimeScale + this._initDTS;
    // one sample's duration value
    const frameDuration: number = scaleFactor * AAC_SAMPLES_PER_FRAME;
    // samples count of this segment's duration
    const nbSamples: number = Math.ceil((endDTS - startDTS) / frameDuration);
    // silent frame
    const silentFrame: Uint8Array | undefined = AAC.getSilentFrame(
      track.manifestCodec || track.codec,
      track.channelCount
    );

    logger.warn('[mp4-remuxer]: remux empty Audio');
    // Can't remux if we can't generate a silent frame...
    if (!silentFrame) {
      logger.trace(
        '[mp4-remuxer]: Unable to remuxEmptyAudio since we were unable to get a silent frame for given audio codec'
      );
      return;
    }

    const samples: Array<any> = [];
    for (let i = 0; i < nbSamples; i++) {
      const stamp = startDTS + i * frameDuration;
      samples.push({ unit: silentFrame, pts: stamp, dts: stamp });
    }
    track.samples = samples;

    return this.remuxAudio(track, timeOffset, contiguous, false);
  }
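
  // A rough worked example for remuxEmptyAudio() above (hypothetical values,
  // inputTimeScale = 90000, samplerate = 44100): scaleFactor ≈ 2.0408 gives
  // frameDuration ≈ 2089.8 ticks (~23.2 ms), so a 6 s video-only span of
  // 540000 ticks is padded with Math.ceil(540000 / 2089.8) = 259 silent AAC
  // frames before being handed to remuxAudio().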
}

export function normalizePts(value: number, reference: number | null): number {
  let offset;
  if (reference === null) {
    return value;
  }

  if (reference < value) {
    // - 2^33
    offset = -8589934592;
  } else {
    // + 2^33
    offset = 8589934592;
  }
  /* PTS is 33-bit (from 0 to 2^33 - 1)
    if the diff between value and reference is bigger than half of the amplitude (2^32),
    it means that PTS looping occurred: fill the gap */
  while (Math.abs(value - reference) > 4294967296) {
    value += offset;
  }

  return value;
}

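// A rough worked example (illustrative 90 kHz values):
// normalizePts(2048, 8589930000) returns 2048 + 2^33 = 8589936640, while
// normalizePts(8589930000, 2048) returns 8589930000 - 2^33 = -4592; either
// way the value is folded to within 2^32 ticks of the reference, keeping
// PTS/DTS monotonic across 33-bit wraparounds.
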
function findKeyframeIndex(samples: Array<AvcSample>): number {
  for (let i = 0; i < samples.length; i++) {
    if (samples[i].key) {
      return i;
    }
  }
  return -1;
}

export function flushTextTrackMetadataCueSamples(
  track: DemuxedMetadataTrack,
  timeOffset: number,
  initPTS: number,
  initDTS: number
): RemuxedMetadata | undefined {
  const length = track.samples.length;
  if (!length) {
    return;
  }
  const inputTimeScale = track.inputTimeScale;
  for (let index = 0; index < length; index++) {
    const sample = track.samples[index];
    // setting id3 pts, dts to relative time
    // using initPTS and initDTS to calculate relative time
    sample.pts =
      normalizePts(sample.pts - initPTS, timeOffset * inputTimeScale) /
      inputTimeScale;
    sample.dts =
      normalizePts(sample.dts - initDTS, timeOffset * inputTimeScale) /
      inputTimeScale;
  }
  const samples = track.samples;
  track.samples = [];
  return {
    samples,
  };
}

export function flushTextTrackUserdataCueSamples(
  track: DemuxedUserdataTrack,
  timeOffset: number,
  initPTS: number
): RemuxedUserdata | undefined {
  const length = track.samples.length;
  if (!length) {
    return;
  }

  const inputTimeScale = track.inputTimeScale;
  for (let index = 0; index < length; index++) {
    const sample = track.samples[index];
    // setting text pts, dts to relative time
    // using initPTS to calculate relative time
    sample.pts =
      normalizePts(sample.pts - initPTS, timeOffset * inputTimeScale) /
      inputTimeScale;
  }
  track.samples.sort((a, b) => a.pts - b.pts);
  const samples = track.samples;
  track.samples = [];
  return {
    samples,
  };
}

class Mp4Sample {
  public size: number;
  public duration: number;
  public cts: number;
  public flags: Mp4SampleFlags;

  constructor(
    isKeyframe: boolean,
    duration: number,
    size: number,
    cts: number
  ) {
    this.duration = duration;
    this.size = size;
    this.cts = cts;
    this.flags = new Mp4SampleFlags(isKeyframe);
  }
}

class Mp4SampleFlags {
  public isLeading: 0 = 0;
  public isDependedOn: 0 = 0;
  public hasRedundancy: 0 = 0;
  public degradPrio: 0 = 0;
  public dependsOn: 1 | 2 = 1;
  public isNonSync: 0 | 1 = 1;

  constructor(isKeyframe) {
    this.dependsOn = isKeyframe ? 2 : 1;
    this.isNonSync = isKeyframe ? 0 : 1;
  }
}
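
// Note (per the ISO/IEC 14496-12 sample-dependency flags): dependsOn === 2
// with isNonSync === 0 marks a sync sample (keyframe) that depends on no
// other samples, while dependsOn === 1 with isNonSync === 1 marks a predicted
// frame; the generator packs these fields into each sample's flags word in
// the trun box.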