patch-mptcp 497 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
77778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827
78278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527852885298530853185328533853485358536853785388539854085418542854385448545854685478548854985508551855285538554855585568557855885598560856185628563856485658566856785688569857085718572857385748575857685778578857985808581858285838584858585868587858885898590859185928593859485958596859785988599860086018602860386048605860686078608860986108611861286138614861586168617861886198620862186228623862486258626862786288629863086318632863386348635863686378638863986408641864286438644864586468647864886498650865186528653865486558656865786588659866086618662866386648665866686678668866986708671867286738674867586768677867886798680868186828683868486858686868786888689869086918692869386948695869686978698869987008701870287038704870587068707870887098710871187128713871487158716871787188719872087218722872387248725872687278728872987308731873287338734873587368737873887398740874187428743874487458746874787488749875087518752875387548755875687578758875987608761876287638764876587668767876887698770877187728773877487758776877
78778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258925992609261926292639264926592669267926892699270927192729273927492759276927
79278927992809281928292839284928592869287928892899290929192929293929492959296929792989299930093019302930393049305930693079308930993109311931293139314931593169317931893199320932193229323932493259326932793289329933093319332933393349335933693379338933993409341934293439344934593469347934893499350935193529353935493559356935793589359936093619362936393649365936693679368936993709371937293739374937593769377937893799380938193829383938493859386938793889389939093919392939393949395939693979398939994009401940294039404940594069407940894099410941194129413941494159416941794189419942094219422942394249425942694279428942994309431943294339434943594369437943894399440944194429443944494459446944794489449945094519452945394549455945694579458945994609461946294639464946594669467946894699470947194729473947494759476947794789479948094819482948394849485948694879488948994909491949294939494949594969497949894999500950195029503950495059506950795089509951095119512951395149515951695179518951995209521952295239524952595269527952895299530953195329533953495359536953795389539954095419542954395449545954695479548954995509551955295539554955595569557955895599560956195629563956495659566956795689569957095719572957395749575957695779578957995809581958295839584958595869587958895899590959195929593959495959596959795989599960096019602960396049605960696079608960996109611961296139614961596169617961896199620962196229623962496259626962796289629963096319632963396349635963696379638963996409641964296439644964596469647964896499650965196529653965496559656965796589659966096619662966396649665966696679668966996709671967296739674967596769677967896799680968196829683968496859686968796889689969096919692969396949695969696979698969997009701970297039704970597069707970897099710971197129713971497159716971797189719972097219722972397249725972697279728972997309731973297339734973597369737973897399740974197429743974497459746974797489749975097519752975397549755975697579758975997609761976297639764976597669767976897699770977197729773977497759776977
79778977997809781978297839784978597869787978897899790979197929793979497959796979797989799980098019802980398049805980698079808980998109811981298139814981598169817981898199820982198229823982498259826982798289829983098319832983398349835983698379838983998409841984298439844984598469847984898499850985198529853985498559856985798589859986098619862986398649865986698679868986998709871987298739874987598769877987898799880988198829883988498859886988798889889989098919892989398949895989698979898989999009901990299039904990599069907990899099910991199129913991499159916991799189919992099219922992399249925992699279928992999309931993299339934993599369937993899399940994199429943994499459946994799489949995099519952995399549955995699579958995999609961996299639964996599669967996899699970997199729973997499759976997799789979998099819982998399849985998699879988998999909991999299939994999599969997999899991000010001100021000310004100051000610007100081000910010100111001210013100141001510016100171001810019100201002110022100231002410025100261002710028100291003010031100321003310034100351003610037100381003910040100411004210043100441004510046100471004810049100501005110052100531005410055100561005710058100591006010061100621006310064100651006610067100681006910070100711007210073100741007510076100771007810079100801008110082100831008410085100861008710088100891009010091100921009310094100951009610097100981009910100101011010210103101041010510106101071010810109101101011110112101131011410115101161011710118101191012010121101221012310124101251012610127101281012910130101311013210133101341013510136101371013810139101401014110142101431014410145101461014710148101491015010151101521015310154101551015610157101581015910160101611016210163101641016510166101671016810169101701017110172101731017410175101761017710178101791018010181101821018310184101851018610187101881018910190101911019210193101941019510196101971019810199102001020110202102031020410205102061020710208102091021010211102121021310214102151021610217102181021910220102211
02221022310224102251022610227102281022910230102311023210233102341023510236102371023810239102401024110242102431024410245102461024710248102491025010251102521025310254102551025610257102581025910260102611026210263102641026510266102671026810269102701027110272102731027410275102761027710278102791028010281102821028310284102851028610287102881028910290102911029210293102941029510296102971029810299103001030110302103031030410305103061030710308103091031010311103121031310314103151031610317103181031910320103211032210323103241032510326103271032810329103301033110332103331033410335103361033710338103391034010341103421034310344103451034610347103481034910350103511035210353103541035510356103571035810359103601036110362103631036410365103661036710368103691037010371103721037310374103751037610377103781037910380103811038210383103841038510386103871038810389103901039110392103931039410395103961039710398103991040010401104021040310404104051040610407104081040910410104111041210413104141041510416104171041810419104201042110422104231042410425104261042710428104291043010431104321043310434104351043610437104381043910440104411044210443104441044510446104471044810449104501045110452104531045410455104561045710458104591046010461104621046310464104651046610467104681046910470104711047210473104741047510476104771047810479104801048110482104831048410485104861048710488104891049010491104921049310494104951049610497104981049910500105011050210503105041050510506105071050810509105101051110512105131051410515105161051710518105191052010521105221052310524105251052610527105281052910530105311053210533105341053510536105371053810539105401054110542105431054410545105461054710548105491055010551105521055310554105551055610557105581055910560105611056210563105641056510566105671056810569105701057110572105731057410575105761057710578105791058010581105821058310584105851058610587105881058910590105911059210593105941059510596105971059810599106001060110602106031060410605106061060710608106091061010611106121061310614106151061610617106181061910620106211
06221062310624106251062610627106281062910630106311063210633106341063510636106371063810639106401064110642106431064410645106461064710648106491065010651106521065310654106551065610657106581065910660106611066210663106641066510666106671066810669106701067110672106731067410675106761067710678106791068010681106821068310684106851068610687106881068910690106911069210693106941069510696106971069810699107001070110702107031070410705107061070710708107091071010711107121071310714107151071610717107181071910720107211072210723107241072510726107271072810729107301073110732107331073410735107361073710738107391074010741107421074310744107451074610747107481074910750107511075210753107541075510756107571075810759107601076110762107631076410765107661076710768107691077010771107721077310774107751077610777107781077910780107811078210783107841078510786107871078810789107901079110792107931079410795107961079710798107991080010801108021080310804108051080610807108081080910810108111081210813108141081510816108171081810819108201082110822108231082410825108261082710828108291083010831108321083310834108351083610837108381083910840108411084210843108441084510846108471084810849108501085110852108531085410855108561085710858108591086010861108621086310864108651086610867108681086910870108711087210873108741087510876108771087810879108801088110882108831088410885108861088710888108891089010891108921089310894108951089610897108981089910900109011090210903109041090510906109071090810909109101091110912109131091410915109161091710918109191092010921109221092310924109251092610927109281092910930109311093210933109341093510936109371093810939109401094110942109431094410945109461094710948109491095010951109521095310954109551095610957109581095910960109611096210963109641096510966109671096810969109701097110972109731097410975109761097710978109791098010981109821098310984109851098610987109881098910990109911099210993109941099510996109971099810999110001100111002110031100411005110061100711008110091101011011110121101311014110151101611017110181101911020110211
10221102311024110251102611027110281102911030110311103211033110341103511036110371103811039110401104111042110431104411045110461104711048110491105011051110521105311054110551105611057110581105911060110611106211063110641106511066110671106811069110701107111072110731107411075110761107711078110791108011081110821108311084110851108611087110881108911090110911109211093110941109511096110971109811099111001110111102111031110411105111061110711108111091111011111111121111311114111151111611117111181111911120111211112211123111241112511126111271112811129111301113111132111331113411135111361113711138111391114011141111421114311144111451114611147111481114911150111511115211153111541115511156111571115811159111601116111162111631116411165111661116711168111691117011171111721117311174111751117611177111781117911180111811118211183111841118511186111871118811189111901119111192111931119411195111961119711198111991120011201112021120311204112051120611207112081120911210112111121211213112141121511216112171121811219112201122111222112231122411225112261122711228112291123011231112321123311234112351123611237112381123911240112411124211243112441124511246112471124811249112501125111252112531125411255112561125711258112591126011261112621126311264112651126611267112681126911270112711127211273112741127511276112771127811279112801128111282112831128411285112861128711288112891129011291112921129311294112951129611297112981129911300113011130211303113041130511306113071130811309113101131111312113131131411315113161131711318113191132011321113221132311324113251132611327113281132911330113311133211333113341133511336113371133811339113401134111342113431134411345113461134711348113491135011351113521135311354113551135611357113581135911360113611136211363113641136511366113671136811369113701137111372113731137411375113761137711378113791138011381113821138311384113851138611387113881138911390113911139211393113941139511396113971139811399114001140111402114031140411405114061140711408114091141011411114121141311414114151141611417114181141911420114211
14221142311424114251142611427114281142911430114311143211433114341143511436114371143811439114401144111442114431144411445114461144711448114491145011451114521145311454114551145611457114581145911460114611146211463114641146511466114671146811469114701147111472114731147411475114761147711478114791148011481114821148311484114851148611487114881148911490114911149211493114941149511496114971149811499115001150111502115031150411505115061150711508115091151011511115121151311514115151151611517115181151911520115211152211523115241152511526115271152811529115301153111532115331153411535115361153711538115391154011541115421154311544115451154611547115481154911550115511155211553115541155511556115571155811559115601156111562115631156411565115661156711568115691157011571115721157311574115751157611577115781157911580115811158211583115841158511586115871158811589115901159111592115931159411595115961159711598115991160011601116021160311604116051160611607116081160911610116111161211613116141161511616116171161811619116201162111622116231162411625116261162711628116291163011631116321163311634116351163611637116381163911640116411164211643116441164511646116471164811649116501165111652116531165411655116561165711658116591166011661116621166311664116651166611667116681166911670116711167211673116741167511676116771167811679116801168111682116831168411685116861168711688116891169011691116921169311694116951169611697116981169911700117011170211703117041170511706117071170811709117101171111712117131171411715117161171711718117191172011721117221172311724117251172611727117281172911730117311173211733117341173511736117371173811739117401174111742117431174411745117461174711748117491175011751117521175311754117551175611757117581175911760117611176211763117641176511766117671176811769117701177111772117731177411775117761177711778117791178011781117821178311784117851178611787117881178911790117911179211793117941179511796117971179811799118001180111802118031180411805118061180711808118091181011811118121181311814118151181611817118181181911820118211
18221182311824118251182611827118281182911830118311183211833118341183511836118371183811839118401184111842118431184411845118461184711848118491185011851118521185311854118551185611857118581185911860118611186211863118641186511866118671186811869118701187111872118731187411875118761187711878118791188011881118821188311884118851188611887118881188911890118911189211893118941189511896118971189811899119001190111902119031190411905119061190711908119091191011911119121191311914119151191611917119181191911920119211192211923119241192511926119271192811929119301193111932119331193411935119361193711938119391194011941119421194311944119451194611947119481194911950119511195211953119541195511956119571195811959119601196111962119631196411965119661196711968119691197011971119721197311974119751197611977119781197911980119811198211983119841198511986119871198811989119901199111992119931199411995119961199711998119991200012001120021200312004120051200612007120081200912010120111201212013120141201512016120171201812019120201202112022120231202412025120261202712028120291203012031120321203312034120351203612037120381203912040120411204212043120441204512046120471204812049120501205112052120531205412055120561205712058120591206012061120621206312064120651206612067120681206912070120711207212073120741207512076120771207812079120801208112082120831208412085120861208712088120891209012091120921209312094120951209612097120981209912100121011210212103121041210512106121071210812109121101211112112121131211412115121161211712118121191212012121121221212312124121251212612127121281212912130121311213212133121341213512136121371213812139121401214112142121431214412145121461214712148121491215012151121521215312154121551215612157121581215912160121611216212163121641216512166121671216812169121701217112172121731217412175121761217712178121791218012181121821218312184121851218612187121881218912190121911219212193121941219512196121971219812199122001220112202122031220412205122061220712208122091221012211122121221312214122151221612217122181221912220122211
22221222312224122251222612227122281222912230122311223212233122341223512236122371223812239122401224112242122431224412245122461224712248122491225012251122521225312254122551225612257122581225912260122611226212263122641226512266122671226812269122701227112272122731227412275122761227712278122791228012281122821228312284122851228612287122881228912290122911229212293122941229512296122971229812299123001230112302123031230412305123061230712308123091231012311123121231312314123151231612317123181231912320123211232212323123241232512326123271232812329123301233112332123331233412335123361233712338123391234012341123421234312344123451234612347123481234912350123511235212353123541235512356123571235812359123601236112362123631236412365123661236712368123691237012371123721237312374123751237612377123781237912380123811238212383123841238512386123871238812389123901239112392123931239412395123961239712398123991240012401124021240312404124051240612407124081240912410124111241212413124141241512416124171241812419124201242112422124231242412425124261242712428124291243012431124321243312434124351243612437124381243912440124411244212443124441244512446124471244812449124501245112452124531245412455124561245712458124591246012461124621246312464124651246612467124681246912470124711247212473124741247512476124771247812479124801248112482124831248412485124861248712488124891249012491124921249312494124951249612497124981249912500125011250212503125041250512506125071250812509125101251112512125131251412515125161251712518125191252012521125221252312524125251252612527125281252912530125311253212533125341253512536125371253812539125401254112542125431254412545125461254712548125491255012551125521255312554125551255612557125581255912560125611256212563125641256512566125671256812569125701257112572125731257412575125761257712578125791258012581125821258312584125851258612587125881258912590125911259212593125941259512596125971259812599126001260112602126031260412605126061260712608126091261012611126121261312614126151261612617126181261912620126211
26221262312624126251262612627126281262912630126311263212633126341263512636126371263812639126401264112642126431264412645126461264712648126491265012651126521265312654126551265612657126581265912660126611266212663126641266512666126671266812669126701267112672126731267412675126761267712678126791268012681126821268312684126851268612687126881268912690126911269212693126941269512696126971269812699127001270112702127031270412705127061270712708127091271012711127121271312714127151271612717127181271912720127211272212723127241272512726127271272812729127301273112732127331273412735127361273712738127391274012741127421274312744127451274612747127481274912750127511275212753127541275512756127571275812759127601276112762127631276412765127661276712768127691277012771127721277312774127751277612777127781277912780127811278212783127841278512786127871278812789127901279112792127931279412795127961279712798127991280012801128021280312804128051280612807128081280912810128111281212813128141281512816128171281812819128201282112822128231282412825128261282712828128291283012831128321283312834128351283612837128381283912840128411284212843128441284512846128471284812849128501285112852128531285412855128561285712858128591286012861128621286312864128651286612867128681286912870128711287212873128741287512876128771287812879128801288112882128831288412885128861288712888128891289012891128921289312894128951289612897128981289912900129011290212903129041290512906129071290812909129101291112912129131291412915129161291712918129191292012921129221292312924129251292612927129281292912930129311293212933129341293512936129371293812939129401294112942129431294412945129461294712948129491295012951129521295312954129551295612957129581295912960129611296212963129641296512966129671296812969129701297112972129731297412975129761297712978129791298012981129821298312984129851298612987129881298912990129911299212993129941299512996129971299812999130001300113002130031300413005130061300713008130091301013011130121301313014130151301613017130181301913020130211
30221302313024130251302613027130281302913030130311303213033130341303513036130371303813039130401304113042130431304413045130461304713048130491305013051130521305313054130551305613057130581305913060130611306213063130641306513066130671306813069130701307113072130731307413075130761307713078130791308013081130821308313084130851308613087130881308913090130911309213093130941309513096130971309813099131001310113102131031310413105131061310713108131091311013111131121311313114131151311613117131181311913120131211312213123131241312513126131271312813129131301313113132131331313413135131361313713138131391314013141131421314313144131451314613147131481314913150131511315213153131541315513156131571315813159131601316113162131631316413165131661316713168131691317013171131721317313174131751317613177131781317913180131811318213183131841318513186131871318813189131901319113192131931319413195131961319713198131991320013201132021320313204132051320613207132081320913210132111321213213132141321513216132171321813219132201322113222132231322413225132261322713228132291323013231132321323313234132351323613237132381323913240132411324213243132441324513246132471324813249132501325113252132531325413255132561325713258132591326013261132621326313264132651326613267132681326913270132711327213273132741327513276132771327813279132801328113282132831328413285132861328713288132891329013291132921329313294132951329613297132981329913300133011330213303133041330513306133071330813309133101331113312133131331413315133161331713318133191332013321133221332313324133251332613327133281332913330133311333213333133341333513336133371333813339133401334113342133431334413345133461334713348133491335013351133521335313354133551335613357133581335913360133611336213363133641336513366133671336813369133701337113372133731337413375133761337713378133791338013381133821338313384133851338613387133881338913390133911339213393133941339513396133971339813399134001340113402134031340413405134061340713408134091341013411134121341313414134151341613417134181341913420134211
34221342313424134251342613427134281342913430134311343213433134341343513436134371343813439134401344113442134431344413445134461344713448134491345013451134521345313454134551345613457134581345913460134611346213463134641346513466134671346813469134701347113472134731347413475134761347713478134791348013481134821348313484134851348613487134881348913490134911349213493134941349513496134971349813499135001350113502135031350413505135061350713508135091351013511135121351313514135151351613517135181351913520135211352213523135241352513526135271352813529135301353113532135331353413535135361353713538135391354013541135421354313544135451354613547135481354913550135511355213553135541355513556135571355813559135601356113562135631356413565135661356713568135691357013571135721357313574135751357613577135781357913580135811358213583135841358513586135871358813589135901359113592135931359413595135961359713598135991360013601136021360313604136051360613607136081360913610136111361213613136141361513616136171361813619136201362113622136231362413625136261362713628136291363013631136321363313634136351363613637136381363913640136411364213643136441364513646136471364813649136501365113652136531365413655136561365713658136591366013661136621366313664136651366613667136681366913670136711367213673136741367513676136771367813679136801368113682136831368413685136861368713688136891369013691136921369313694136951369613697136981369913700137011370213703137041370513706137071370813709137101371113712137131371413715137161371713718137191372013721137221372313724137251372613727137281372913730137311373213733137341373513736137371373813739137401374113742137431374413745137461374713748137491375013751137521375313754137551375613757137581375913760137611376213763137641376513766137671376813769137701377113772137731377413775137761377713778137791378013781137821378313784137851378613787137881378913790137911379213793137941379513796137971379813799138001380113802138031380413805138061380713808138091381013811138121381313814138151381613817138181381913820138211
38221382313824138251382613827138281382913830138311383213833138341383513836138371383813839138401384113842138431384413845138461384713848138491385013851138521385313854138551385613857138581385913860138611386213863138641386513866138671386813869138701387113872138731387413875138761387713878138791388013881138821388313884138851388613887138881388913890138911389213893138941389513896138971389813899139001390113902139031390413905139061390713908139091391013911139121391313914139151391613917139181391913920139211392213923139241392513926139271392813929139301393113932139331393413935139361393713938139391394013941139421394313944139451394613947139481394913950139511395213953139541395513956139571395813959139601396113962139631396413965139661396713968139691397013971139721397313974139751397613977139781397913980139811398213983139841398513986139871398813989139901399113992139931399413995139961399713998139991400014001140021400314004140051400614007140081400914010140111401214013140141401514016140171401814019140201402114022140231402414025140261402714028140291403014031140321403314034140351403614037140381403914040140411404214043140441404514046140471404814049140501405114052140531405414055140561405714058140591406014061140621406314064140651406614067140681406914070140711407214073140741407514076140771407814079140801408114082140831408414085140861408714088140891409014091140921409314094140951409614097140981409914100141011410214103141041410514106141071410814109141101411114112141131411414115141161411714118141191412014121141221412314124141251412614127141281412914130141311413214133141341413514136141371413814139141401414114142141431414414145141461414714148141491415014151141521415314154141551415614157141581415914160141611416214163141641416514166141671416814169141701417114172141731417414175141761417714178141791418014181141821418314184141851418614187141881418914190141911419214193141941419514196141971419814199142001420114202142031420414205142061420714208142091421014211142121421314214142151421614217142181421914220142211
42221422314224142251422614227142281422914230142311423214233142341423514236142371423814239142401424114242142431424414245142461424714248142491425014251142521425314254142551425614257142581425914260142611426214263142641426514266142671426814269142701427114272142731427414275142761427714278142791428014281142821428314284142851428614287142881428914290142911429214293142941429514296142971429814299143001430114302143031430414305143061430714308143091431014311143121431314314143151431614317143181431914320143211432214323143241432514326143271432814329143301433114332143331433414335143361433714338143391434014341143421434314344143451434614347143481434914350143511435214353143541435514356143571435814359143601436114362143631436414365143661436714368143691437014371143721437314374143751437614377143781437914380143811438214383143841438514386143871438814389143901439114392143931439414395143961439714398143991440014401144021440314404144051440614407144081440914410144111441214413144141441514416144171441814419144201442114422144231442414425144261442714428144291443014431144321443314434144351443614437144381443914440144411444214443144441444514446144471444814449144501445114452144531445414455144561445714458144591446014461144621446314464144651446614467144681446914470144711447214473144741447514476144771447814479144801448114482144831448414485144861448714488144891449014491144921449314494144951449614497144981449914500145011450214503145041450514506145071450814509145101451114512145131451414515145161451714518145191452014521145221452314524145251452614527145281452914530145311453214533145341453514536145371453814539145401454114542145431454414545145461454714548145491455014551145521455314554145551455614557145581455914560145611456214563145641456514566145671456814569145701457114572145731457414575145761457714578145791458014581145821458314584145851458614587145881458914590145911459214593145941459514596145971459814599146001460114602146031460414605146061460714608146091461014611146121461314614146151461614617146181461914620146211
46221462314624146251462614627146281462914630146311463214633146341463514636146371463814639146401464114642146431464414645146461464714648146491465014651146521465314654146551465614657146581465914660146611466214663146641466514666146671466814669146701467114672146731467414675146761467714678146791468014681146821468314684146851468614687146881468914690146911469214693146941469514696146971469814699147001470114702147031470414705147061470714708147091471014711147121471314714147151471614717147181471914720147211472214723147241472514726147271472814729147301473114732147331473414735147361473714738147391474014741147421474314744147451474614747147481474914750147511475214753147541475514756147571475814759147601476114762147631476414765147661476714768147691477014771147721477314774147751477614777147781477914780147811478214783147841478514786147871478814789147901479114792147931479414795147961479714798147991480014801148021480314804148051480614807148081480914810148111481214813148141481514816148171481814819148201482114822148231482414825148261482714828148291483014831148321483314834148351483614837148381483914840148411484214843148441484514846148471484814849148501485114852148531485414855148561485714858148591486014861148621486314864148651486614867148681486914870148711487214873148741487514876148771487814879148801488114882148831488414885148861488714888148891489014891148921489314894148951489614897148981489914900149011490214903149041490514906149071490814909149101491114912149131491414915149161491714918149191492014921149221492314924149251492614927149281492914930149311493214933149341493514936149371493814939149401494114942149431494414945149461494714948149491495014951149521495314954149551495614957149581495914960149611496214963149641496514966149671496814969149701497114972149731497414975149761497714978149791498014981149821498314984149851498614987149881498914990149911499214993149941499514996149971499814999150001500115002150031500415005150061500715008150091501015011150121501315014150151501615017150181501915020150211
50221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211
54221542315424154251542615427154281542915430154311543215433154341543515436154371543815439154401544115442154431544415445154461544715448154491545015451154521545315454154551545615457154581545915460154611546215463154641546515466154671546815469154701547115472154731547415475154761547715478154791548015481154821548315484154851548615487154881548915490154911549215493154941549515496154971549815499155001550115502155031550415505155061550715508155091551015511155121551315514155151551615517155181551915520155211552215523155241552515526155271552815529155301553115532155331553415535155361553715538155391554015541155421554315544155451554615547155481554915550155511555215553155541555515556155571555815559155601556115562155631556415565155661556715568155691557015571155721557315574155751557615577155781557915580155811558215583155841558515586155871558815589155901559115592155931559415595155961559715598155991560015601156021560315604156051560615607156081560915610156111561215613156141561515616156171561815619156201562115622156231562415625156261562715628156291563015631156321563315634156351563615637156381563915640156411564215643156441564515646156471564815649156501565115652156531565415655156561565715658156591566015661156621566315664156651566615667156681566915670156711567215673156741567515676156771567815679156801568115682156831568415685156861568715688156891569015691156921569315694156951569615697156981569915700157011570215703157041570515706157071570815709157101571115712157131571415715157161571715718157191572015721157221572315724157251572615727157281572915730157311573215733157341573515736157371573815739157401574115742157431574415745157461574715748157491575015751157521575315754157551575615757157581575915760157611576215763157641576515766157671576815769157701577115772157731577415775157761577715778157791578015781157821578315784157851578615787157881578915790157911579215793157941579515796157971579815799158001580115802158031580415805158061580715808158091581015811158121581315814158151581615817158181581915820158211
58221582315824158251582615827158281582915830158311583215833158341583515836158371583815839158401584115842158431584415845158461584715848158491585015851158521585315854158551585615857158581585915860158611586215863158641586515866158671586815869158701587115872158731587415875158761587715878158791588015881158821588315884158851588615887158881588915890158911589215893158941589515896158971589815899159001590115902159031590415905159061590715908159091591015911159121591315914159151591615917159181591915920159211592215923159241592515926159271592815929159301593115932159331593415935159361593715938159391594015941159421594315944159451594615947159481594915950159511595215953159541595515956159571595815959159601596115962159631596415965159661596715968159691597015971159721597315974159751597615977159781597915980159811598215983159841598515986159871598815989159901599115992159931599415995159961599715998159991600016001160021600316004160051600616007160081600916010160111601216013160141601516016160171601816019160201602116022160231602416025160261602716028160291603016031160321603316034160351603616037160381603916040160411604216043160441604516046160471604816049160501605116052160531605416055160561605716058160591606016061160621606316064160651606616067160681606916070160711607216073160741607516076160771607816079160801608116082160831608416085160861608716088160891609016091160921609316094160951609616097160981609916100161011610216103161041610516106161071610816109161101611116112161131611416115161161611716118161191612016121161221612316124161251612616127161281612916130161311613216133161341613516136161371613816139161401614116142161431614416145161461614716148161491615016151161521615316154161551615616157161581615916160161611616216163161641616516166161671616816169161701617116172161731617416175161761617716178161791618016181161821618316184161851618616187161881618916190161911619216193161941619516196161971619816199162001620116202162031620416205162061620716208162091621016211162121621316214162151621616217162181621916220162211
62221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211
66221662316624166251662616627166281662916630166311663216633166341663516636166371663816639166401664116642166431664416645166461664716648166491665016651166521665316654166551665616657166581665916660166611666216663166641666516666166671666816669166701667116672166731667416675166761667716678166791668016681166821668316684166851668616687166881668916690166911669216693166941669516696166971669816699167001670116702167031670416705167061670716708167091671016711167121671316714167151671616717167181671916720167211672216723167241672516726167271672816729167301673116732167331673416735167361673716738167391674016741167421674316744167451674616747167481674916750167511675216753167541675516756167571675816759167601676116762167631676416765167661676716768167691677016771167721677316774167751677616777167781677916780167811678216783167841678516786167871678816789167901679116792167931679416795167961679716798167991680016801168021680316804168051680616807168081680916810168111681216813168141681516816168171681816819168201682116822168231682416825168261682716828168291683016831168321683316834168351683616837168381683916840168411684216843168441684516846168471684816849168501685116852168531685416855168561685716858168591686016861168621686316864168651686616867168681686916870168711687216873168741687516876168771687816879168801688116882168831688416885168861688716888168891689016891168921689316894168951689616897168981689916900169011690216903169041690516906169071690816909169101691116912169131691416915169161691716918169191692016921169221692316924169251692616927169281692916930169311693216933169341693516936169371693816939169401694116942169431694416945169461694716948169491695016951169521695316954169551695616957169581695916960169611696216963169641696516966169671696816969169701697116972169731697416975169761697716978169791698016981169821698316984169851698616987169881698916990169911699216993169941699516996169971699816999170001700117002170031700417005170061700717008170091701017011170121701317014170151701617017170181701917020170211
702217023170241702517026170271702817029170301703117032170331703417035170361703717038170391704017041170421704317044170451704617047170481704917050170511705217053170541705517056170571705817059170601706117062170631706417065170661706717068170691707017071170721707317074170751707617077170781707917080170811708217083170841708517086170871708817089170901709117092170931709417095170961709717098170991710017101171021710317104171051710617107171081710917110171111711217113171141711517116171171711817119171201712117122171231712417125171261712717128171291713017131171321713317134171351713617137171381713917140171411714217143171441714517146171471714817149171501715117152171531715417155171561715717158171591716017161171621716317164171651716617167171681716917170171711717217173171741717517176171771717817179171801718117182171831718417185171861718717188171891719017191171921719317194171951719617197171981719917200172011720217203
  1. diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
  2. index d286bde..34d56d7 100644
  3. --- a/drivers/infiniband/hw/cxgb4/cm.c
  4. +++ b/drivers/infiniband/hw/cxgb4/cm.c
  5. @@ -3162,7 +3162,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
  6. */
  7. memset(&tmp_opt, 0, sizeof(tmp_opt));
  8. tcp_clear_options(&tmp_opt);
  9. - tcp_parse_options(skb, &tmp_opt, 0, NULL);
  10. + tcp_parse_options(skb, &tmp_opt, NULL, 0, NULL);
  11. req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req));
  12. memset(req, 0, sizeof(*req));
  13. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
  14. index 2faef33..9c12362 100644
  15. --- a/include/linux/ipv6.h
  16. +++ b/include/linux/ipv6.h
  17. @@ -309,12 +309,6 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
  18. return NULL;
  19. }
  20. -static inline struct inet6_request_sock *
  21. - inet6_rsk(const struct request_sock *rsk)
  22. -{
  23. - return NULL;
  24. -}
  25. -
  26. static inline struct raw6_sock *raw6_sk(const struct sock *sk)
  27. {
  28. return NULL;
  29. diff --git a/include/linux/tcp.h b/include/linux/tcp.h
  30. index 4ad0706..a230dd0 100644
  31. --- a/include/linux/tcp.h
  32. +++ b/include/linux/tcp.h
  33. @@ -72,6 +72,53 @@ struct tcp_sack_block {
  34. u32 end_seq;
  35. };
  36. +struct tcp_out_options { /* TCP options to include in an outgoing segment */
  37. + u16 options; /* bit field of OPTION_* */
  38. + u8 ws; /* window scale, 0 to disable */
  39. + u8 num_sack_blocks;/* number of SACK blocks to include */
  40. + u8 hash_size; /* bytes in hash_location */
  41. + u16 mss; /* 0 to disable */
  42. + __u8 *hash_location; /* temporary pointer, overloaded */
  43. + __u32 tsval, tsecr; /* need to include OPTION_TS */
  44. + struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
  45. +#ifdef CONFIG_MPTCP
  46. + u16 mptcp_options; /* bit field of MPTCP related OPTION_* */
  47. + u8 dss_csum:1, /* dss-checksum required? */
  48. + add_addr_v4:1, /* NOTE(review): presumably "add_addr4 below is to be sent" - confirm */
  49. + add_addr_v6:1; /* NOTE(review): presumably "add_addr6 below is to be sent" - confirm */
  50. +
  51. + __u32 data_seq; /* data sequence number, for MPTCP */
  52. + __u32 data_ack; /* data ack, for MPTCP */
  53. +
  54. + union {
  55. + struct {
  56. + __u64 sender_key; /* sender's key for mptcp */
  57. + __u64 receiver_key; /* receiver's key for mptcp */
  58. + } mp_capable;
  59. +
  60. + struct {
  61. + __u64 sender_truncated_mac;
  62. + __u32 sender_nonce;
  63. + /* sender_nonce: random number of the sender */
  64. + __u32 token; /* token for mptcp */
  65. + } mp_join_syns;
  66. + };
  67. +
  68. + struct {
  69. + struct in_addr addr;
  70. + u8 addr_id;
  71. + } add_addr4;
  72. +
  73. + struct {
  74. + struct in6_addr addr;
  75. + u8 addr_id;
  76. + } add_addr6;
  77. +
  78. + u16 remove_addrs; /* list of address ids */
  79. + u8 addr_id; /* address id (mp_join or add_address) */
  80. +#endif /* CONFIG_MPTCP */
  81. +};
  82. +
  83. /*These are used to set the sack_ok field in struct tcp_options_received */
  84. #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */
  85. #define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/
  86. @@ -95,6 +142,9 @@ struct tcp_options_received {
  87. u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
  88. };
  89. +struct mptcp_cb;
  90. +struct mptcp_tcp_sock;
  91. +
  92. static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
  93. {
  94. rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
  95. @@ -123,6 +173,7 @@ struct tcp_request_sock {
  96. * FastOpen it's the seq#
  97. * after data-in-SYN.
  98. */
  99. + u8 saw_mpc:1;
  100. };
  101. static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
  102. @@ -130,6 +181,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
  103. return (struct tcp_request_sock *)req;
  104. }
  105. +struct tcp_md5sig_key;
  106. +
  107. struct tcp_sock {
  108. /* inet_connection_sock has to be the first member of tcp_sock */
  109. struct inet_connection_sock inet_conn;
  110. @@ -323,6 +376,45 @@ struct tcp_sock {
  111. * socket. Used to retransmit SYNACKs etc.
  112. */
  113. struct request_sock *fastopen_rsk;
  114. +
  115. +
  116. + struct mptcp_cb *mpcb;
  117. + struct sock *meta_sk;
  118. + /* We keep these flags even if CONFIG_MPTCP is not enabled, because
  119. + * it allows checking MPTCP capability just by checking the mpc flag,
  120. + * rather than adding ifdefs everywhere.
  121. + */
  122. + u16 mpc:1, /* Other end is multipath capable */
  123. + inside_tk_table:1, /* Is the tcp_sock inside the token-table? */
  124. + send_mp_fclose:1,
  125. + request_mptcp:1, /* Did we send out an MP_CAPABLE?
  126. + * (this speeds up mptcp_doit() in tcp_recvmsg)
  127. + */
  128. + mptcp_enabled:1, /* Is MPTCP enabled from the application ? */
  129. + pf:1, /* Potentially Failed state: when this flag is set, we
  130. + * stop using the subflow
  131. + */
  132. + mp_killed:1, /* Killed with a tcp_done in mptcp? */
  133. + was_meta_sk:1, /* This was a meta sk (in case of reuse) */
  134. + close_it:1, /* Must close socket in mptcp_data_ready? */
  135. + closing:1;
  136. + struct mptcp_tcp_sock *mptcp;
  137. +#ifdef CONFIG_MPTCP
  138. + struct hlist_nulls_node tk_table;
  139. + u32 mptcp_loc_token;
  140. + u64 mptcp_loc_key;
  141. +#endif /* CONFIG_MPTCP */
  142. +
  143. + /* Functions that depend on the value of the mpc flag */
  144. + u32 (*__select_window)(struct sock *sk);
  145. + u16 (*select_window)(struct sock *sk);
  146. + void (*select_initial_window)(int __space, __u32 mss, __u32 *rcv_wnd,
  147. + __u32 *window_clamp, int wscale_ok,
  148. + __u8 *rcv_wscale, __u32 init_rcv_wnd,
  149. + const struct sock *sk);
  150. + void (*init_buffer_space)(struct sock *sk);
  151. + void (*set_rto)(struct sock *sk);
  152. + bool (*should_expand_sndbuf)(const struct sock *sk);
  153. };
  154. enum tsq_flags {
  155. @@ -334,6 +426,8 @@ enum tsq_flags {
  156. TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
  157. * tcp_v{4|6}_mtu_reduced()
  158. */
  159. + MPTCP_PATH_MANAGER, /* MPTCP deferred creation of new subflows */
  160. + MPTCP_SUB_DEFERRED, /* A subflow got deferred - process them */
  161. };
  162. static inline struct tcp_sock *tcp_sk(const struct sock *sk)
  163. @@ -352,6 +446,7 @@ struct tcp_timewait_sock {
  164. #ifdef CONFIG_TCP_MD5SIG
  165. struct tcp_md5sig_key *tw_md5_key;
  166. #endif
  167. + struct mptcp_tw *mptcp_tw;
  168. };
  169. static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
  170. diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
  171. index f981ba7..0144c65 100644
  172. --- a/include/net/inet6_connection_sock.h
  173. +++ b/include/net/inet6_connection_sock.h
  174. @@ -27,6 +27,8 @@ int inet6_csk_bind_conflict(const struct sock *sk,
  175. struct dst_entry *inet6_csk_route_req(struct sock *sk, struct flowi6 *fl6,
  176. const struct request_sock *req);
  177. +u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
  178. + const u32 rnd, const u32 synq_hsize);
  179. struct request_sock *inet6_csk_search_req(const struct sock *sk,
  180. struct request_sock ***prevp,
  181. diff --git a/include/net/inet_common.h b/include/net/inet_common.h
  182. index fe7994c..780f229 100644
  183. --- a/include/net/inet_common.h
  184. +++ b/include/net/inet_common.h
  185. @@ -1,6 +1,8 @@
  186. #ifndef _INET_COMMON_H
  187. #define _INET_COMMON_H
  188. +#include <net/sock.h>
  189. +
  190. extern const struct proto_ops inet_stream_ops;
  191. extern const struct proto_ops inet_dgram_ops;
  192. @@ -13,6 +15,8 @@ struct sock;
  193. struct sockaddr;
  194. struct socket;
  195. +int inet_create(struct net *net, struct socket *sock, int protocol, int kern);
  196. +int inet6_create(struct net *net, struct socket *sock, int protocol, int kern);
  197. int inet_release(struct socket *sock);
  198. int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
  199. int addr_len, int flags);
  200. diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
  201. index c55aeed..84d1927 100644
  202. --- a/include/net/inet_connection_sock.h
  203. +++ b/include/net/inet_connection_sock.h
  204. @@ -243,6 +243,9 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
  205. struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
  206. +u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
  207. + const u32 synq_hsize);
  208. +
  209. struct request_sock *inet_csk_search_req(const struct sock *sk,
  210. struct request_sock ***prevp,
  211. const __be16 rport,
  212. diff --git a/include/net/mptcp.h b/include/net/mptcp.h
  213. new file mode 100644
  214. index 0000000..6454535
  215. --- /dev/null
  216. +++ b/include/net/mptcp.h
  217. @@ -0,0 +1,1471 @@
  218. +/*
  219. + * MPTCP implementation
  220. + *
  221. + * Initial Design & Implementation:
  222. + * Sébastien Barré <sebastien.barre@uclouvain.be>
  223. + *
  224. + * Current Maintainer & Author:
  225. + * Christoph Paasch <christoph.paasch@uclouvain.be>
  226. + *
  227. + * Additional authors:
  228. + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
  229. + * Gregory Detal <gregory.detal@uclouvain.be>
  230. + * Fabien Duchêne <fabien.duchene@uclouvain.be>
  231. + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
  232. + * Lavkesh Lahngir <lavkesh51@gmail.com>
  233. + * Andreas Ripke <ripke@neclab.eu>
  234. + * Vlad Dogaru <vlad.dogaru@intel.com>
  235. + * Octavian Purdila <octavian.purdila@intel.com>
  236. + * John Ronan <jronan@tssg.org>
  237. + * Catalin Nicutar <catalin.nicutar@gmail.com>
  238. + * Brandon Heller <brandonh@stanford.edu>
  239. + *
  240. + *
  241. + * This program is free software; you can redistribute it and/or
  242. + * modify it under the terms of the GNU General Public License
  243. + * as published by the Free Software Foundation; either version
  244. + * 2 of the License, or (at your option) any later version.
  245. + */
  246. +
  247. +#ifndef _MPTCP_H
  248. +#define _MPTCP_H
  249. +
  250. +#include <linux/inetdevice.h>
  251. +#include <linux/ipv6.h>
  252. +#include <linux/list.h>
  253. +#include <linux/net.h>
  254. +#include <linux/netpoll.h>
  255. +#include <linux/skbuff.h>
  256. +#include <linux/socket.h>
  257. +#include <linux/tcp.h>
  258. +#include <linux/kernel.h>
  259. +
  260. +#include <asm/byteorder.h>
  261. +#include <asm/unaligned.h>
  262. +#include <crypto/hash.h>
  263. +#include <net/tcp.h>
  264. +
  265. +/* 64-bit network <-> host byte-order helpers.
  266. + * be64_to_cpu()/cpu_to_be64() are already identity operations on
  267. + * big-endian hosts, so they are used unconditionally; no #if on an
  268. + * endianness macro (byte order or bitfield order) is needed here.
  269. + */
  270. +#define ntohll(x) be64_to_cpu(x)
  271. +#define htonll(x) cpu_to_be64(x)
  272. +
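  273. +/* Max number of local or remote addresses we can store.
  274. + * When changing, also check the u8 bitfields below, which presumably hold one bit per address (bitfield/retry_bitfield in mptcp_rem4/6, rem4_bits/rem6_bits in mptcp_cb). */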
  275. +#define MPTCP_MAX_ADDR 8
  276. +
  277. +#define MPTCP_SUBFLOW_RETRY_DELAY 1000
  278. +
  279. +struct mptcp_loc4 { /* one local IPv4 address entry */
  280. + u8 loc4_id; /* id of this entry */
  281. + u8 low_prio:1; /* NOTE(review): presumably "use as backup", as for low_prio in mptcp_tcp_sock - confirm */
  282. + struct in_addr addr; /* the IPv4 address */
  283. +};
  284. +
  285. +struct mptcp_rem4 { /* one remote IPv4 address entry; mptcp_cb keeps an array of these in remaddr4[] */
  286. + u8 rem4_id; /* id of this entry */
  287. + u8 bitfield; /* NOTE(review): bit meaning is not visible in this header chunk - confirm */
  288. + u8 retry_bitfield; /* NOTE(review): presumably the retry counterpart of bitfield - confirm */
  289. + __be16 port; /* port, big-endian (__be16) */
  290. + struct in_addr addr; /* the IPv4 address */
  291. +};
  292. +
  293. +struct mptcp_loc6 { /* one local IPv6 address entry; same layout as mptcp_loc4 with an in6_addr */
  294. + u8 loc6_id; /* id of this entry */
  295. + u8 low_prio:1; /* NOTE(review): presumably "use as backup", as for low_prio in mptcp_tcp_sock - confirm */
  296. + struct in6_addr addr; /* the IPv6 address */
  297. +};
  298. +
  299. +struct mptcp_rem6 { /* one remote IPv6 address entry; mptcp_cb keeps an array of these in remaddr6[] */
  300. + u8 rem6_id; /* id of this entry */
  301. + u8 bitfield; /* NOTE(review): bit meaning is not visible in this header chunk - confirm */
  302. + u8 retry_bitfield; /* NOTE(review): presumably the retry counterpart of bitfield - confirm */
  303. + __be16 port; /* port, big-endian (__be16) */
  304. + struct in6_addr addr; /* the IPv6 address */
  305. +};
  306. +
  307. +struct mptcp_request_sock { /* tcp_request_sock extended with MPTCP handshake state */
  308. + struct tcp_request_sock req; /* embedded base; NOTE(review): presumably must stay first so the sock can be cast - confirm */
  309. + struct mptcp_cb *mpcb;
  310. + /* Collision list in the tuple hashtable. We need to find
  311. + * the req sock when receiving the third message of the 3-way handshake,
  312. + * since that one does not contain the token. If this makes
  313. + * the request sock too long, we can use kmalloc'ed specific entries for
  314. + * that tuple hashtable. At the moment, though, the request_sock
  315. + * itself is extended.
  316. + */
  317. + struct list_head collide_tuple;
  318. + struct hlist_nulls_node collide_tk; /* NOTE(review): presumably the collision list of the token hashtable - confirm */
  319. + u32 mptcp_rem_nonce;
  320. + u32 mptcp_loc_token;
  321. + u64 mptcp_loc_key;
  322. + u64 mptcp_rem_key;
  323. + u64 mptcp_hash_tmac;
  324. + u32 mptcp_loc_nonce;
  325. + u8 loc_id;
  326. + u8 rem_id; /* Address-id in the MP_JOIN */
  327. + u8 dss_csum:1,
  328. + low_prio:1;
  329. +};
  330. +
  331. +struct mptcp_options_received { /* MPTCP information taken from received options */
  332. + u16 saw_mpc:1, /* the flags below use 15 of the 16 bits */
  333. + dss_csum:1,
  334. + drop_me:1,
  335. +
  336. + is_mp_join:1,
  337. + join_ack:1,
  338. +
  339. + saw_low_prio:2, /* 0x1 - low-prio set for this subflow
  340. + * 0x2 - low-prio set for another subflow
  341. + */
  342. + low_prio:1,
  343. +
  344. + saw_add_addr:2, /* Saw at least one add_addr option:
  345. + * 0x1: IPv4 - 0x2: IPv6
  346. + */
  347. + more_add_addr:1, /* Saw more than one add_addr option. */
  348. +
  349. + saw_rem_addr:1, /* Saw at least one rem_addr option */
  350. + more_rem_addr:1, /* Saw more than one rem_addr option. */
  351. +
  352. + mp_fail:1,
  353. + mp_fclose:1;
  354. + u8 rem_id; /* Address-id in the MP_JOIN */
  355. + u8 prio_addr_id; /* Address-id in the MP_PRIO */
  356. +
  357. + const unsigned char *add_addr_ptr; /* Pointer to add-address option */
  358. + const unsigned char *rem_addr_ptr; /* Pointer to rem-address option */
  359. +
  360. + u32 data_ack;
  361. + u32 data_seq;
  362. + u16 data_len;
  363. +
  364. + u32 mptcp_rem_token;/* Remote token */
  365. +
  366. + /* Key inside the option (from mp_capable or fast_close) */
  367. + u64 mptcp_key;
  368. +
  369. + u32 mptcp_recv_nonce;
  370. + u64 mptcp_recv_tmac;
  371. + u8 mptcp_recv_mac[20]; /* 20 bytes, same size as sender_mac in mptcp_tcp_sock */
  372. +};
  373. +
  374. +struct mptcp_tcp_sock { /* MPTCP state attached to one tcp_sock (see the mptcp pointer in struct tcp_sock) */
  375. + struct tcp_sock *next; /* Next subflow socket */
  376. + struct list_head cb_list; /* NOTE(review): presumably links into mptcp_cb.callback_list - confirm */
  377. + struct mptcp_options_received rx_opt;
  378. +
  379. + /* The next three fields record the current mapping */
  380. + u64 map_data_seq;
  381. + u32 map_subseq;
  382. + u16 map_data_len;
  383. + u16 slave_sk:1,
  384. + fully_established:1,
  385. + establish_increased:1,
  386. + second_packet:1,
  387. + attached:1,
  388. + send_mp_fail:1,
  389. + include_mpc:1,
  390. + mapping_present:1,
  391. + map_data_fin:1,
  392. + low_prio:1, /* use this socket as backup */
  393. + rcv_low_prio:1, /* Peer sent low-prio option to us */
  394. + send_mp_prio:1, /* Trigger to send mp_prio on this socket */
  395. + pre_established:1; /* State between sending 3rd ACK and
  396. + * receiving the fourth ack of new subflows.
  397. + */
  398. +
  399. + /* isn: needed to translate abs to relative subflow seqnums */
  400. + u32 snt_isn;
  401. + u32 rcv_isn;
  402. + u32 last_data_seq;
  403. + u8 path_index;
  404. + u8 loc_id;
  405. + u8 rem_id;
  406. +
  407. + u32 last_rbuf_opti; /* Timestamp of last rbuf optimization */
  408. + unsigned int sent_pkts;
  409. +
  410. + struct sk_buff *shortcut_ofoqueue; /* Shortcut to the current modified
  411. + * skb in the ofo-queue.
  412. + */
  413. +
  414. + int init_rcv_wnd;
  415. + u32 infinite_cutoff_seq;
  416. + struct delayed_work work;
  417. + u32 mptcp_loc_nonce;
  418. + struct tcp_sock *tp; /* NOTE(review): presumably a back-pointer to the tcp_sock that owns this state - confirm */
  419. + u32 last_end_data_seq;
  420. +
  421. + /* MP_JOIN subflow: timer for retransmitting the 3rd ack */
  422. + struct timer_list mptcp_ack_timer;
  423. +
  424. + /* HMAC of the third ack */
  425. + char sender_mac[20];
  426. +};
  427. +
  428. +struct mptcp_tw { /* MPTCP time-wait state (see the mptcp_tw pointer in struct tcp_timewait_sock) */
  429. + struct list_head list; /* NOTE(review): presumably an entry of mptcp_cb.tw_list - confirm */
  430. + u64 loc_key;
  431. + u64 rcv_nxt;
  432. + struct mptcp_cb __rcu *mpcb; /* __rcu-annotated pointer to the control block */
  433. + u8 meta_tw:1,
  434. + in_list:1; /* NOTE(review): presumably set while linked through list - confirm */
  435. +};
  436. +
  437. +#define MPTCP_PM_NAME_MAX 16 /* size of mptcp_pm_ops.name[] */
  438. +struct mptcp_pm_ops { /* callback table; NOTE(review): "pm" presumably stands for path manager - confirm */
  439. + struct list_head list;
  440. +
  441. + /* Signal the creation of a new MPTCP-session. */
  442. + void (*new_session)(struct sock *meta_sk, int index);
  443. + void (*release_sock)(struct sock *meta_sk);
  444. + void (*fully_established)(struct sock *meta_sk);
  445. + void (*new_remote_address)(struct sock *meta_sk);
  446. + int (*get_local_index)(sa_family_t family, union inet_addr *addr,
  447. + struct net *net);
  448. + int (*get_local_id)(sa_family_t family, union inet_addr *addr,
  449. + struct net *net);
  450. + void (*addr_signal)(struct sock *sk, unsigned *size,
  451. + struct tcp_out_options *opts, struct sk_buff *skb);
  452. +
  453. + char name[MPTCP_PM_NAME_MAX]; /* name of this ops table */
  454. + struct module *owner;
  455. +};
  456. +
  457. +struct mptcp_cb { /* state shared by all sockets of one multipath connection */
  458. + struct sock *meta_sk;
  459. +
  460. + /* list of sockets in this multipath connection */
  461. + struct tcp_sock *connection_list;
  462. + /* list of sockets that need a call to release_cb */
  463. + struct list_head callback_list;
  464. +
  465. + spinlock_t tw_lock;
  466. + struct list_head tw_list;
  467. + unsigned char mptw_state;
  468. +
  469. + atomic_t mpcb_refcnt;
  470. +
  471. + /* High-order bits of 64-bit sequence numbers; indexed by *_hiseq_index */
  472. + u32 snd_high_order[2];
  473. + u32 rcv_high_order[2];
  474. +
  475. + u16 send_infinite_mapping:1,
  476. + in_time_wait:1,
  477. + list_rcvd:1, /* XXX TO REMOVE */
  478. + dss_csum:1,
  479. + server_side:1,
  480. + infinite_mapping_rcv:1,
  481. + infinite_mapping_snd:1,
  482. + dfin_combined:1, /* Was the DFIN combined with subflow-fin? */
  483. + passive_close:1,
  484. + snd_hiseq_index:1, /* Index in snd_high_order of snd_nxt */
  485. + rcv_hiseq_index:1; /* Index in rcv_high_order of rcv_nxt */
  486. +
  487. + /* socket counts in this connection */
  488. + u8 cnt_subflows;
  489. + u8 cnt_established;
  490. +
  491. + u32 noneligible; /* Path mask of the subflows that the
  492. + * scheduler temporarily treats as non-eligible
  493. + */
  494. +
  495. + struct sk_buff_head reinject_queue;
  496. +
  497. + u8 dfin_path_index;
  498. +
  499. +#define MPTCP_PM_SIZE 320
  500. + u8 mptcp_pm[MPTCP_PM_SIZE] __aligned(8); /* NOTE(review): presumably opaque storage private to pm_ops - confirm */
  501. + struct mptcp_pm_ops *pm_ops;
  502. +
  503. + /* Mutex needed, because otherwise mptcp_close will complain that the
  504. + * socket is owned by the user.
  505. + * E.g., mptcp_sub_close_wq is taking the meta-lock.
  506. + */
  507. + struct mutex mpcb_mutex;
  508. +
  509. + /* Master socket, also part of the connection_list; this
  510. + * socket is the one that the application sees.
  511. + */
  512. + struct sock *master_sk;
  513. +
  514. + u64 csum_cutoff_seq;
  515. +
  516. + __u64 mptcp_loc_key;
  517. + __u32 mptcp_loc_token;
  518. + __u64 mptcp_rem_key;
  519. + __u32 mptcp_rem_token;
  520. +
  521. + /* Create a new subflow - necessary because the meta-sk may be IPv4, but
  522. + * the new subflow can be IPv6
  523. + */
  524. + struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb,
  525. + struct request_sock *req,
  526. + struct dst_entry *dst);
  527. +
  528. + /* Remote addresses */
  529. + struct mptcp_rem4 remaddr4[MPTCP_MAX_ADDR];
  530. + u8 rem4_bits; /* NOTE(review): presumably a bitmap of used remaddr4[] slots - confirm */
  531. +
  532. + struct mptcp_rem6 remaddr6[MPTCP_MAX_ADDR];
  533. + u8 rem6_bits; /* NOTE(review): presumably a bitmap of used remaddr6[] slots - confirm */
  534. +
  535. + u32 path_index_bits;
  536. + /* Next path index (pi) to pick up in case a new path becomes available */
  537. + u8 next_path_index;
  538. +
  539. + /* Original snd/rcvbuf of the initial subflow.
  540. + * Used for the new subflows on the server-side to allow correct
  541. + * autotuning
  542. + */
  543. + int orig_sk_rcvbuf;
  544. + int orig_sk_sndbuf;
  545. + u32 orig_window_clamp;
  546. +};
  547. +
  548. +#define MPTCP_SUB_CAPABLE 0
  549. +#define MPTCP_SUB_LEN_CAPABLE_SYN 12
  550. +#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN 12
  551. +#define MPTCP_SUB_LEN_CAPABLE_ACK 20
  552. +#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN 20
  553. +
  554. +#define MPTCP_SUB_JOIN 1
  555. +#define MPTCP_SUB_LEN_JOIN_SYN 12
  556. +#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN 12
  557. +#define MPTCP_SUB_LEN_JOIN_SYNACK 16
  558. +#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN 16
  559. +#define MPTCP_SUB_LEN_JOIN_ACK 24
  560. +#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN 24
  561. +
  562. +#define MPTCP_SUB_DSS 2
  563. +#define MPTCP_SUB_LEN_DSS 4
  564. +#define MPTCP_SUB_LEN_DSS_ALIGN 4
  565. +
  566. +/* Lengths for seq and ack are the ones without the generic MPTCP-option header,
  567. + * as they are part of the DSS-option.
  568. + * To get the total length, just add the different options together.
  569. + */
  570. +#define MPTCP_SUB_LEN_SEQ 10
  571. +#define MPTCP_SUB_LEN_SEQ_CSUM 12
  572. +#define MPTCP_SUB_LEN_SEQ_ALIGN 12
  573. +
  574. +#define MPTCP_SUB_LEN_SEQ_64 14
  575. +#define MPTCP_SUB_LEN_SEQ_CSUM_64 16
  576. +#define MPTCP_SUB_LEN_SEQ_64_ALIGN 16
  577. +
  578. +#define MPTCP_SUB_LEN_ACK 4
  579. +#define MPTCP_SUB_LEN_ACK_ALIGN 4
  580. +
  581. +#define MPTCP_SUB_LEN_ACK_64 8
  582. +#define MPTCP_SUB_LEN_ACK_64_ALIGN 8
  583. +
  584. +/* This is the "default" option-length we will send out most often.
  585. + * MPTCP DSS-header
  586. + * 32-bit data sequence number
  587. + * 32-bit data ack
  588. + *
  589. + * It is necessary to calculate the effective MSS we will be using when
  590. + * sending data.
  591. + */
  592. +#define MPTCP_SUB_LEN_DSM_ALIGN (MPTCP_SUB_LEN_DSS_ALIGN + \
  593. + MPTCP_SUB_LEN_SEQ_ALIGN + \
  594. + MPTCP_SUB_LEN_ACK_ALIGN)
  595. +
  596. +#define MPTCP_SUB_ADD_ADDR 3
  597. +#define MPTCP_SUB_LEN_ADD_ADDR4 8
  598. +#define MPTCP_SUB_LEN_ADD_ADDR6 20
  599. +#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN 8
  600. +#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN 20
  601. +
  602. +#define MPTCP_SUB_REMOVE_ADDR 4
  603. +#define MPTCP_SUB_LEN_REMOVE_ADDR 4
  604. +
  605. +#define MPTCP_SUB_PRIO 5
  606. +#define MPTCP_SUB_LEN_PRIO 3
  607. +#define MPTCP_SUB_LEN_PRIO_ADDR 4
  608. +#define MPTCP_SUB_LEN_PRIO_ALIGN 4
  609. +
  610. +#define MPTCP_SUB_FAIL 6
  611. +#define MPTCP_SUB_LEN_FAIL 12
  612. +#define MPTCP_SUB_LEN_FAIL_ALIGN 12
  613. +
  614. +#define MPTCP_SUB_FCLOSE 7
  615. +#define MPTCP_SUB_LEN_FCLOSE 12
  616. +#define MPTCP_SUB_LEN_FCLOSE_ALIGN 12
  617. +
  618. +
  619. +#define OPTION_MPTCP (1 << 5)
  620. +
  621. +static inline void reset_mpc(struct tcp_sock *tp)
  622. +{
  623. + tp->mpc = 0;
  624. +
  625. + tp->__select_window = __tcp_select_window;
  626. + tp->select_window = tcp_select_window;
  627. + tp->select_initial_window = tcp_select_initial_window;
  628. + tp->init_buffer_space = tcp_init_buffer_space;
  629. + tp->set_rto = tcp_set_rto;
  630. + tp->should_expand_sndbuf = tcp_should_expand_sndbuf;
  631. +}
  632. +
  633. +/* Initializes MPTCP flags in tcp_sock (and other tcp_sock members that depend
  634. + * on those flags).
  635. + */
  636. +static inline void mptcp_init_tcp_sock(struct tcp_sock *tp)
  637. +{
  638. + reset_mpc(tp);
  639. +}
  640. +
  641. +#ifdef CONFIG_MPTCP
  642. +
  643. +/* Used for checking if the mptcp initialization has been successful */
  644. +extern bool mptcp_init_failed;
  645. +
  646. +/* MPTCP options */
  647. +#define OPTION_TYPE_SYN (1 << 0)
  648. +#define OPTION_TYPE_SYNACK (1 << 1)
  649. +#define OPTION_TYPE_ACK (1 << 2)
  650. +#define OPTION_MP_CAPABLE (1 << 3)
  651. +#define OPTION_DATA_ACK (1 << 4)
  652. +#define OPTION_ADD_ADDR (1 << 5)
  653. +#define OPTION_MP_JOIN (1 << 6)
  654. +#define OPTION_MP_FAIL (1 << 7)
  655. +#define OPTION_MP_FCLOSE (1 << 8)
  656. +#define OPTION_REMOVE_ADDR (1 << 9)
  657. +#define OPTION_MP_PRIO (1 << 10)
  658. +
  659. +/* MPTCP flags */
  660. +#define MPTCPHDR_ACK 0x01
  661. +#define MPTCPHDR_SEQ 0x02
  662. +#define MPTCPHDR_FIN 0x04
  663. +#define MPTCPHDR_INF 0x08
  664. +#define MPTCPHDR_SEQ64_SET 0x10 /* Did we received a 64-bit seq number */
  665. +#define MPTCPHDR_SEQ64_OFO 0x20 /* Is it not in our circular array? */
  666. +#define MPTCPHDR_SEQ64_INDEX 0x40 /* Index of seq in mpcb->snd_high_order */
  667. +#define MPTCPHDR_DSS_CSUM 0x80
  668. +
  669. +/* It is impossible, that all 8 bits of mptcp_flags are set to 1 with the above
  670. + * Thus, defining MPTCPHDR_JOIN as 0xFF is safe.
  671. + */
  672. +#define MPTCPHDR_JOIN 0xFF
  673. +
  674. +struct mptcp_option {
  675. + __u8 kind;
  676. + __u8 len;
  677. +#if defined(__LITTLE_ENDIAN_BITFIELD)
  678. + __u8 ver:4,
  679. + sub:4;
  680. +#elif defined(__BIG_ENDIAN_BITFIELD)
  681. + __u8 sub:4,
  682. + ver:4;
  683. +#else
  684. +#error "Adjust your <asm/byteorder.h> defines"
  685. +#endif
  686. +};
  687. +
  688. +struct mp_capable {
  689. + __u8 kind;
  690. + __u8 len;
  691. +#if defined(__LITTLE_ENDIAN_BITFIELD)
  692. + __u8 ver:4,
  693. + sub:4;
  694. + __u8 h:1,
  695. + rsv:5,
  696. + b:1,
  697. + a:1;
  698. +#elif defined(__BIG_ENDIAN_BITFIELD)
  699. + __u8 sub:4,
  700. + ver:4;
  701. + __u8 a:1,
  702. + b:1,
  703. + rsv:5,
  704. + h:1;
  705. +#else
  706. +#error "Adjust your <asm/byteorder.h> defines"
  707. +#endif
  708. + __u64 sender_key;
  709. + __u64 receiver_key;
  710. +} __attribute__((__packed__));
  711. +
  712. +struct mp_join {
  713. + __u8 kind;
  714. + __u8 len;
  715. +#if defined(__LITTLE_ENDIAN_BITFIELD)
  716. + __u8 b:1,
  717. + rsv:3,
  718. + sub:4;
  719. +#elif defined(__BIG_ENDIAN_BITFIELD)
  720. + __u8 sub:4,
  721. + rsv:3,
  722. + b:1;
  723. +#else
  724. +#error "Adjust your <asm/byteorder.h> defines"
  725. +#endif
  726. + __u8 addr_id;
  727. + union {
  728. + struct {
  729. + u32 token;
  730. + u32 nonce;
  731. + } syn;
  732. + struct {
  733. + __u64 mac;
  734. + u32 nonce;
  735. + } synack;
  736. + struct {
  737. + __u8 mac[20];
  738. + } ack;
  739. + } u;
  740. +} __attribute__((__packed__));
  741. +
  742. +struct mp_dss {
  743. + __u8 kind;
  744. + __u8 len;
  745. +#if defined(__LITTLE_ENDIAN_BITFIELD)
  746. + __u16 rsv1:4,
  747. + sub:4,
  748. + A:1,
  749. + a:1,
  750. + M:1,
  751. + m:1,
  752. + F:1,
  753. + rsv2:3;
  754. +#elif defined(__BIG_ENDIAN_BITFIELD)
  755. + __u16 sub:4,
  756. + rsv1:4,
  757. + rsv2:3,
  758. + F:1,
  759. + m:1,
  760. + M:1,
  761. + a:1,
  762. + A:1;
  763. +#else
  764. +#error "Adjust your <asm/byteorder.h> defines"
  765. +#endif
  766. +};
  767. +
  768. +struct mp_add_addr {
  769. + __u8 kind;
  770. + __u8 len;
  771. +#if defined(__LITTLE_ENDIAN_BITFIELD)
  772. + __u8 ipver:4,
  773. + sub:4;
  774. +#elif defined(__BIG_ENDIAN_BITFIELD)
  775. + __u8 sub:4,
  776. + ipver:4;
  777. +#else
  778. +#error "Adjust your <asm/byteorder.h> defines"
  779. +#endif
  780. + __u8 addr_id;
  781. + union {
  782. + struct {
  783. + struct in_addr addr;
  784. + __be16 port;
  785. + } v4;
  786. + struct {
  787. + struct in6_addr addr;
  788. + __be16 port;
  789. + } v6;
  790. + } u;
  791. +} __attribute__((__packed__));
  792. +
  793. +struct mp_remove_addr {
  794. + __u8 kind;
  795. + __u8 len;
  796. +#if defined(__LITTLE_ENDIAN_BITFIELD)
  797. + __u8 rsv:4,
  798. + sub:4;
  799. +#elif defined(__BIG_ENDIAN_BITFIELD)
  800. + __u8 sub:4,
  801. + rsv:4;
  802. +#else
  803. +#error "Adjust your <asm/byteorder.h> defines"
  804. +#endif
  805. + /* list of addr_id */
  806. + __u8 addrs_id;
  807. +};
  808. +
  809. +struct mp_fail {
  810. + __u8 kind;
  811. + __u8 len;
  812. +#if defined(__LITTLE_ENDIAN_BITFIELD)
  813. + __u16 rsv1:4,
  814. + sub:4,
  815. + rsv2:8;
  816. +#elif defined(__BIG_ENDIAN_BITFIELD)
  817. + __u16 sub:4,
  818. + rsv1:4,
  819. + rsv2:8;
  820. +#else
  821. +#error "Adjust your <asm/byteorder.h> defines"
  822. +#endif
  823. + __be64 data_seq;
  824. +} __attribute__((__packed__));
  825. +
  826. +struct mp_fclose {
  827. + __u8 kind;
  828. + __u8 len;
  829. +#if defined(__LITTLE_ENDIAN_BITFIELD)
  830. + __u16 rsv1:4,
  831. + sub:4,
  832. + rsv2:8;
  833. +#elif defined(__BIG_ENDIAN_BITFIELD)
  834. + __u16 sub:4,
  835. + rsv1:4,
  836. + rsv2:8;
  837. +#else
  838. +#error "Adjust your <asm/byteorder.h> defines"
  839. +#endif
  840. + __u64 key;
  841. +} __attribute__((__packed__));
  842. +
  843. +struct mp_prio {
  844. + __u8 kind;
  845. + __u8 len;
  846. +#if defined(__LITTLE_ENDIAN_BITFIELD)
  847. + __u8 b:1,
  848. + rsv:3,
  849. + sub:4;
  850. +#elif defined(__BIG_ENDIAN_BITFIELD)
  851. + __u8 sub:4,
  852. + rsv:3,
  853. + b:1;
  854. +#else
  855. +#error "Adjust your <asm/byteorder.h> defines"
  856. +#endif
  857. + __u8 addr_id;
  858. +} __attribute__((__packed__));
  859. +
  860. +static inline int mptcp_sub_len_dss(struct mp_dss *m, int csum)
  861. +{
  862. + return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2);
  863. +}
  864. +
  865. +#define MPTCP_APP 2
  866. +
  867. +extern int sysctl_mptcp_enabled;
  868. +extern int sysctl_mptcp_checksum;
  869. +extern int sysctl_mptcp_debug;
  870. +extern int sysctl_mptcp_syn_retries;
  871. +
  872. +extern struct workqueue_struct *mptcp_wq;
  873. +
  874. +#define mptcp_debug(fmt, args...) \
  875. + do { \
  876. + if (unlikely(sysctl_mptcp_debug)) \
  877. + pr_err(__FILE__ ": " fmt, ##args); \
  878. + } while (0)
  879. +
  880. +/* Iterates over all subflows */
  881. +#define mptcp_for_each_tp(mpcb, tp) \
  882. + for ((tp) = (mpcb)->connection_list; (tp); (tp) = (tp)->mptcp->next)
  883. +
  884. +#define mptcp_for_each_sk(mpcb, sk) \
  885. + for ((sk) = (struct sock *)(mpcb)->connection_list; \
  886. + sk; \
  887. + sk = (struct sock *)tcp_sk(sk)->mptcp->next)
  888. +
  889. +#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp) \
  890. + for (__sk = (struct sock *)(__mpcb)->connection_list, \
  891. + __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL; \
  892. + __sk; \
  893. + __sk = __temp, \
  894. + __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL)
  895. +
  896. +/* Iterates over all bit set to 1 in a bitset */
  897. +#define mptcp_for_each_bit_set(b, i) \
  898. + for (i = ffs(b) - 1; i >= 0; i = ffs(b >> (i + 1) << (i + 1)) - 1)
  899. +
  900. +#define mptcp_for_each_bit_unset(b, i) \
  901. + mptcp_for_each_bit_set(~b, i)
  902. +
  903. +extern struct lock_class_key meta_key;
  904. +extern struct lock_class_key meta_slock_key;
  905. +extern u32 mptcp_secret[MD5_MESSAGE_BYTES / 4];
  906. +
  907. +/* This is needed to ensure that two subsequent key-generation result in
  908. + * different keys if the IPs and ports are the same.
  909. + */
  910. +extern u32 mptcp_key_seed;
  911. +
  912. +#define MPTCP_HASH_SIZE 1024
  913. +
  914. +extern struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
  915. +
  916. +/* This second hashtable is needed to retrieve request socks
  917. + * created as a result of a join request. While the SYN contains
  918. + * the token, the final ack does not, so we need a separate hashtable
  919. + * to retrieve the mpcb.
  920. + */
  921. +extern struct list_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
  922. +extern spinlock_t mptcp_reqsk_hlock; /* hashtable protection */
  923. +
  924. +/* Lock, protecting the two hash-tables that hold the token. Namely,
  925. + * mptcp_reqsk_tk_htb and tk_hashtable
  926. + */
  927. +extern spinlock_t mptcp_tk_hashlock; /* hashtable protection */
  928. +
  929. +void mptcp_data_ready(struct sock *sk, int bytes);
  930. +void mptcp_write_space(struct sock *sk);
  931. +
  932. +void mptcp_add_meta_ofo_queue(struct sock *meta_sk, struct sk_buff *skb,
  933. + struct sock *sk);
  934. +void mptcp_ofo_queue(struct sock *meta_sk);
  935. +void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp);
  936. +void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied);
  937. +int mptcp_alloc_mpcb(struct sock *master_sk, __u64 remote_key, u32 window);
  938. +int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
  939. + gfp_t flags);
  940. +void mptcp_del_sock(struct sock *sk);
  941. +void mptcp_update_metasocket(struct sock *sock, struct sock *meta_sk);
  942. +void mptcp_reinject_data(struct sock *orig_sk, int clone_it);
  943. +void mptcp_update_sndbuf(struct mptcp_cb *mpcb);
  944. +struct sk_buff *mptcp_next_segment(struct sock *sk, int *reinject);
  945. +void mptcp_send_fin(struct sock *meta_sk);
  946. +void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority);
  947. +int mptcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
  948. + int push_one, gfp_t gfp);
  949. +void mptcp_parse_options(const uint8_t *ptr, int opsize,
  950. + struct tcp_options_received *opt_rx,
  951. + struct mptcp_options_received *mopt,
  952. + const struct sk_buff *skb);
  953. +void mptcp_syn_options(struct sock *sk, struct tcp_out_options *opts,
  954. + unsigned *remaining);
  955. +void mptcp_synack_options(struct request_sock *req,
  956. + struct tcp_out_options *opts,
  957. + unsigned *remaining);
  958. +void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
  959. + struct tcp_out_options *opts, unsigned *size);
  960. +void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
  961. + struct tcp_out_options *opts,
  962. + struct sk_buff *skb);
  963. +void mptcp_close(struct sock *meta_sk, long timeout);
  964. +int mptcp_doit(struct sock *sk);
  965. +int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window);
  966. +int mptcp_check_req_master(struct sock *sk, struct sock *child,
  967. + struct request_sock *req,
  968. + struct request_sock **prev,
  969. + struct mptcp_options_received *mopt);
  970. +struct sock *mptcp_check_req_child(struct sock *sk, struct sock *child,
  971. + struct request_sock *req,
  972. + struct request_sock **prev,
  973. + struct mptcp_options_received *mopt);
  974. +u32 __mptcp_select_window(struct sock *sk);
  975. +void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
  976. + __u32 *window_clamp, int wscale_ok,
  977. + __u8 *rcv_wscale, __u32 init_rcv_wnd,
  978. + const struct sock *sk);
  979. +unsigned int mptcp_current_mss(struct sock *meta_sk);
  980. +int mptcp_select_size(const struct sock *meta_sk, bool sg);
  981. +void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn);
  982. +void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
  983. + u32 *hash_out);
  984. +void mptcp_clean_rtx_infinite(struct sk_buff *skb, struct sock *sk);
  985. +void mptcp_fin(struct sock *meta_sk);
  986. +void mptcp_retransmit_timer(struct sock *meta_sk);
  987. +int mptcp_write_wakeup(struct sock *meta_sk);
  988. +void mptcp_sub_close_wq(struct work_struct *work);
  989. +void mptcp_sub_close(struct sock *sk, unsigned long delay);
  990. +struct sock *mptcp_select_ack_sock(const struct sock *meta_sk, int copied);
  991. +void mptcp_fallback_meta_sk(struct sock *meta_sk);
  992. +int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb);
  993. +struct sock *mptcp_sk_clone(const struct sock *sk, int family, const gfp_t priority);
  994. +void mptcp_ack_handler(unsigned long);
  995. +int mptcp_check_rtt(const struct tcp_sock *tp, int time);
  996. +int mptcp_check_snd_buf(const struct tcp_sock *tp);
  997. +int mptcp_handle_options(struct sock *sk, const struct tcphdr *th, struct sk_buff *skb);
  998. +void __init mptcp_init(void);
  999. +int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len);
  1000. +int mptcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
  1001. + unsigned int mss_now, int reinject);
  1002. +int mptso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  1003. + unsigned int mss_now, gfp_t gfp, int reinject);
  1004. +void mptcp_destroy_sock(struct sock *sk);
  1005. +int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
  1006. + struct sk_buff *skb,
  1007. + struct mptcp_options_received *mopt);
  1008. +unsigned int mptcp_xmit_size_goal(struct sock *meta_sk, u32 mss_now,
  1009. + int large_allowed);
  1010. +int mptcp_time_wait(struct sock *sk, struct tcp_timewait_sock *tw);
  1011. +void mptcp_twsk_destructor(struct tcp_timewait_sock *tw);
  1012. +void mptcp_update_tw_socks(const struct tcp_sock *tp, int state);
  1013. +void mptcp_disconnect(struct sock *sk);
  1014. +bool mptcp_should_expand_sndbuf(const struct sock *sk);
  1015. +int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb);
  1016. +void mptcp_tsq_flags(struct sock *sk);
  1017. +void mptcp_tsq_sub_deferred(struct sock *meta_sk);
  1018. +struct mp_join *mptcp_find_join(struct sk_buff *skb);
  1019. +void mptcp_hash_remove_bh(struct tcp_sock *meta_tp);
  1020. +void mptcp_hash_remove(struct tcp_sock *meta_tp);
  1021. +struct sock *mptcp_hash_find(struct net *net, u32 token);
  1022. +int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw);
  1023. +int mptcp_do_join_short(struct sk_buff *skb, struct mptcp_options_received *mopt,
  1024. + struct tcp_options_received *tmp_opt, struct net *net);
  1025. +void mptcp_reqsk_destructor(struct request_sock *req);
  1026. +void mptcp_reqsk_new_mptcp(struct request_sock *req,
  1027. + const struct tcp_options_received *rx_opt,
  1028. + const struct mptcp_options_received *mopt,
  1029. + const struct sk_buff *skb);
  1030. +int mptcp_check_req(struct sk_buff *skb, struct net *net);
  1031. +void mptcp_connect_init(struct sock *sk);
  1032. +void mptcp_sub_force_close(struct sock *sk);
  1033. +int mptcp_sub_len_remove_addr_align(u16 bitfield);
  1034. +void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
  1035. + const struct sk_buff *skb);
  1036. +void mptcp_init_buffer_space(struct sock *sk);
  1037. +
  1038. +/* MPTCP-path-manager registration/initialization functions */
  1039. +int mptcp_register_path_manager(struct mptcp_pm_ops *pm);
  1040. +void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm);
  1041. +void mptcp_init_path_manager(struct mptcp_cb *mpcb);
  1042. +void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb);
  1043. +void mptcp_fallback_default(struct mptcp_cb *mpcb);
  1044. +void mptcp_get_default_path_manager(char *name);
  1045. +int mptcp_set_default_path_manager(const char *name);
  1046. +extern struct mptcp_pm_ops mptcp_pm_default;
  1047. +
  1048. +static inline
  1049. +struct mptcp_request_sock *mptcp_rsk(const struct request_sock *req)
  1050. +{
  1051. + return (struct mptcp_request_sock *)req;
  1052. +}
  1053. +
  1054. +static inline
  1055. +struct request_sock *rev_mptcp_rsk(const struct mptcp_request_sock *req)
  1056. +{
  1057. + return (struct request_sock *)req;
  1058. +}
  1059. +
  1060. +static inline bool mptcp_can_sendpage(struct sock *sk)
  1061. +{
  1062. + struct sock *sk_it;
  1063. +
  1064. + if (tcp_sk(sk)->mpcb->dss_csum)
  1065. + return false;
  1066. +
  1067. + mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) {
  1068. + if (!(sk_it->sk_route_caps & NETIF_F_SG) ||
  1069. + !(sk_it->sk_route_caps & NETIF_F_ALL_CSUM))
  1070. + return false;
  1071. + }
  1072. +
  1073. + return true;
  1074. +}
  1075. +
  1076. +static inline void mptcp_push_pending_frames(struct sock *meta_sk)
  1077. +{
  1078. + if (mptcp_next_segment(meta_sk, NULL)) {
  1079. + struct tcp_sock *tp = tcp_sk(meta_sk);
  1080. +
  1081. + /* We don't care about the MSS, because it will be set in
  1082. + * mptcp_write_xmit.
  1083. + */
  1084. + __tcp_push_pending_frames(meta_sk, 0, tp->nonagle);
  1085. + }
  1086. +}
  1087. +
  1088. +static inline void mptcp_send_reset(struct sock *sk)
  1089. +{
  1090. + tcp_send_active_reset(sk, GFP_ATOMIC);
  1091. + mptcp_sub_force_close(sk);
  1092. +}
  1093. +
  1094. +static inline int mptcp_is_data_seq(const struct sk_buff *skb)
  1095. +{
  1096. + return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ;
  1097. +}
  1098. +
  1099. +static inline int mptcp_is_data_fin(const struct sk_buff *skb)
  1100. +{
  1101. + return mptcp_is_data_seq(skb) &&
  1102. + (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_FIN);
  1103. +}
  1104. +
  1105. +/* Is it a data-fin while in infinite mapping mode?
  1106. + * In infinite mode, a subflow-fin is in fact a data-fin.
  1107. + */
  1108. +static inline int mptcp_is_data_fin2(const struct sk_buff *skb,
  1109. + const struct tcp_sock *tp)
  1110. +{
  1111. + return mptcp_is_data_fin(skb) ||
  1112. + (tp->mpcb->infinite_mapping_rcv && tcp_hdr(skb)->fin);
  1113. +}
  1114. +
  1115. +static inline void mptcp_skb_entail_init(const struct tcp_sock *tp,
  1116. + struct sk_buff *skb)
  1117. +{
  1118. + TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_SEQ;
  1119. +}
  1120. +
  1121. +static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb)
  1122. +{
  1123. + u64 data_seq_high = (u32)(data_seq >> 32);
  1124. +
  1125. + if (mpcb->rcv_high_order[0] == data_seq_high)
  1126. + return 0;
  1127. + else if (mpcb->rcv_high_order[1] == data_seq_high)
  1128. + return MPTCPHDR_SEQ64_INDEX;
  1129. + else
  1130. + return MPTCPHDR_SEQ64_OFO;
  1131. +}
  1132. +
  1133. +/* Sets the data_seq and returns pointer to the in-skb field of the data_seq.
  1134. + * If the packet has a 64-bit dseq, the pointer points to the last 32 bits.
  1135. + */
  1136. +static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb,
  1137. + u32 *data_seq,
  1138. + struct mptcp_cb *mpcb)
  1139. +{
  1140. + __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);
  1141. +
  1142. + if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) {
  1143. + u64 data_seq64 = get_unaligned_be64(ptr);
  1144. +
  1145. + if (mpcb)
  1146. + TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb);
  1147. +
  1148. + *data_seq = (u32)data_seq64 ;
  1149. + ptr++;
  1150. + } else {
  1151. + *data_seq = get_unaligned_be32(ptr);
  1152. + }
  1153. +
  1154. + return ptr;
  1155. +}
  1156. +
  1157. +static inline struct sock *mptcp_meta_sk(const struct sock *sk)
  1158. +{
  1159. + return tcp_sk(sk)->meta_sk;
  1160. +}
  1161. +
  1162. +static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
  1163. +{
  1164. + return tcp_sk(tp->meta_sk);
  1165. +}
  1166. +
  1167. +static inline int is_meta_tp(const struct tcp_sock *tp)
  1168. +{
  1169. + return tp->mpcb && mptcp_meta_tp(tp) == tp;
  1170. +}
  1171. +
  1172. +static inline int is_meta_sk(const struct sock *sk)
  1173. +{
  1174. + return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP &&
  1175. + tcp_sk(sk)->mpc && mptcp_meta_sk(sk) == sk;
  1176. +}
  1177. +
  1178. +static inline int is_master_tp(const struct tcp_sock *tp)
  1179. +{
  1180. + return !tp->mpc || (!tp->mptcp->slave_sk && !is_meta_tp(tp));
  1181. +}
  1182. +
  1183. +static inline void mptcp_hash_request_remove(struct request_sock *req)
  1184. +{
  1185. + int in_softirq = 0;
  1186. +
  1187. + if (list_empty(&mptcp_rsk(req)->collide_tuple))
  1188. + return;
  1189. +
  1190. + if (in_softirq()) {
  1191. + spin_lock(&mptcp_reqsk_hlock);
  1192. + in_softirq = 1;
  1193. + } else {
  1194. + spin_lock_bh(&mptcp_reqsk_hlock);
  1195. + }
  1196. +
  1197. + list_del(&mptcp_rsk(req)->collide_tuple);
  1198. +
  1199. + if (in_softirq)
  1200. + spin_unlock(&mptcp_reqsk_hlock);
  1201. + else
  1202. + spin_unlock_bh(&mptcp_reqsk_hlock);
  1203. +}
  1204. +
  1205. +static inline void mptcp_init_mp_opt(struct mptcp_options_received *mopt)
  1206. +{
  1207. + mopt->saw_mpc = 0;
  1208. + mopt->dss_csum = 0;
  1209. + mopt->drop_me = 0;
  1210. +
  1211. + mopt->is_mp_join = 0;
  1212. + mopt->join_ack = 0;
  1213. +
  1214. + mopt->saw_low_prio = 0;
  1215. + mopt->low_prio = 0;
  1216. +
  1217. + mopt->saw_add_addr = 0;
  1218. + mopt->more_add_addr = 0;
  1219. +
  1220. + mopt->saw_rem_addr = 0;
  1221. + mopt->more_rem_addr = 0;
  1222. +
  1223. + mopt->mp_fail = 0;
  1224. + mopt->mp_fclose = 0;
  1225. +}
  1226. +
  1227. +static inline void mptcp_reset_mopt(struct tcp_sock *tp)
  1228. +{
  1229. + struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
  1230. +
  1231. + mopt->saw_low_prio = 0;
  1232. + mopt->saw_add_addr = 0;
  1233. + mopt->more_add_addr = 0;
  1234. + mopt->saw_rem_addr = 0;
  1235. + mopt->more_rem_addr = 0;
  1236. + mopt->join_ack = 0;
  1237. + mopt->mp_fail = 0;
  1238. + mopt->mp_fclose = 0;
  1239. +}
  1240. +
  1241. +static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb,
  1242. + const struct mptcp_cb *mpcb)
  1243. +{
  1244. + return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags &
  1245. + MPTCPHDR_SEQ64_INDEX) ? 1 : 0]);
  1246. +}
  1247. +
  1248. +static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index,
  1249. + u32 data_seq_32)
  1250. +{
  1251. + return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32;
  1252. +}
  1253. +
  1254. +static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp)
  1255. +{
  1256. + struct mptcp_cb *mpcb = meta_tp->mpcb;
  1257. + return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
  1258. + meta_tp->rcv_nxt);
  1259. +}
  1260. +
  1261. +static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc)
  1262. +{
  1263. + if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) {
  1264. + struct mptcp_cb *mpcb = meta_tp->mpcb;
  1265. + mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
  1266. + mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2;
  1267. + }
  1268. +}
  1269. +
  1270. +static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp,
  1271. + u32 old_rcv_nxt)
  1272. +{
  1273. + if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) {
  1274. + struct mptcp_cb *mpcb = meta_tp->mpcb;
  1275. + mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2;
  1276. + mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1;
  1277. + }
  1278. +}
  1279. +
  1280. +static inline int mptcp_sk_can_send(const struct sock *sk)
  1281. +{
  1282. + return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
  1283. + !tcp_sk(sk)->mptcp->pre_established;
  1284. +}
  1285. +
  1286. +static inline int mptcp_sk_can_recv(const struct sock *sk)
  1287. +{
  1288. + return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCP_FIN_WAIT1 | TCP_FIN_WAIT2);
  1289. +}
  1290. +
  1291. +static inline int mptcp_sk_can_send_ack(const struct sock *sk)
  1292. +{
  1293. + return !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV |
  1294. + TCPF_CLOSE | TCPF_LISTEN)) &&
  1295. + !tcp_sk(sk)->mptcp->pre_established;
  1296. +}
  1297. +
  1298. +/* Only support GSO if all subflows supports it */
  1299. +static inline bool mptcp_sk_can_gso(const struct sock *meta_sk)
  1300. +{
  1301. + struct sock *sk;
  1302. +
  1303. + if (tcp_sk(meta_sk)->mpcb->dss_csum)
  1304. + return 0;
  1305. +
  1306. + mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
  1307. + if (!mptcp_sk_can_send(sk))
  1308. + continue;
  1309. + if (!sk_can_gso(sk))
  1310. + return false;
  1311. + }
  1312. + return true;
  1313. +}
  1314. +
  1315. +static inline bool mptcp_can_sg(const struct sock *meta_sk)
  1316. +{
  1317. + struct sock *sk;
  1318. +
  1319. + if (tcp_sk(meta_sk)->mpcb->dss_csum)
  1320. + return 0;
  1321. +
  1322. + mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
  1323. + if (!mptcp_sk_can_send(sk))
  1324. + continue;
  1325. + if (!(sk->sk_route_caps & NETIF_F_SG))
  1326. + return false;
  1327. + }
  1328. + return true;
  1329. +}
  1330. +
  1331. +static inline void mptcp_set_rto(struct sock *sk)
  1332. +{
  1333. + struct tcp_sock *tp = tcp_sk(sk);
  1334. + struct sock *sk_it;
  1335. + struct inet_connection_sock *micsk = inet_csk(mptcp_meta_sk(sk));
  1336. + __u32 max_rto = 0;
  1337. +
  1338. + /* We are in recovery-phase on the MPTCP-level. Do not update the
  1339. + * RTO, because this would kill exponential backoff.
  1340. + */
  1341. + if (micsk->icsk_retransmits)
  1342. + return;
  1343. +
  1344. + mptcp_for_each_sk(tp->mpcb, sk_it) {
  1345. + if (mptcp_sk_can_send(sk_it) &&
  1346. + inet_csk(sk_it)->icsk_rto > max_rto)
  1347. + max_rto = inet_csk(sk_it)->icsk_rto;
  1348. + }
  1349. + if (max_rto) {
  1350. + micsk->icsk_rto = max_rto << 1;
  1351. +
  1352. + /* A successfull rto-measurement - reset backoff counter */
  1353. + micsk->icsk_backoff = 0;
  1354. + }
  1355. +}
  1356. +
  1357. +static inline int mptcp_sysctl_syn_retries(void)
  1358. +{
  1359. + return sysctl_mptcp_syn_retries;
  1360. +}
  1361. +
  1362. +static inline void mptcp_sub_close_passive(struct sock *sk)
  1363. +{
  1364. + struct sock *meta_sk = mptcp_meta_sk(sk);
  1365. + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(meta_sk);
  1366. +
  1367. + /* Only close, if the app did a send-shutdown (passive close), and we
  1368. + * received the data-ack of the data-fin.
  1369. + */
  1370. + if (tp->mpcb->passive_close && meta_tp->snd_una == meta_tp->write_seq)
  1371. + mptcp_sub_close(sk, 0);
  1372. +}
  1373. +
  1374. +static inline bool mptcp_fallback_infinite(struct sock *sk, int flag)
  1375. +{
  1376. + struct tcp_sock *tp = tcp_sk(sk);
  1377. +
  1378. + /* If data has been acknowleged on the meta-level, fully_established
  1379. + * will have been set before and thus we will not fall back to infinite
  1380. + * mapping.
  1381. + */
  1382. + if (likely(tp->mptcp->fully_established))
  1383. + return false;
  1384. +
  1385. + if (!(flag & MPTCP_FLAG_DATA_ACKED))
  1386. + return false;
  1387. +
  1388. + /* Don't fallback twice ;) */
  1389. + if (tp->mpcb->infinite_mapping_snd)
  1390. + return false;
  1391. +
  1392. + pr_err("%s %#x will fallback - pi %d, src %pI4 dst %pI4 from %pS\n",
  1393. + __func__, tp->mpcb->mptcp_loc_token, tp->mptcp->path_index,
  1394. + &inet_sk(sk)->inet_saddr, &inet_sk(sk)->inet_daddr,
  1395. + __builtin_return_address(0));
  1396. + if (!is_master_tp(tp))
  1397. + return true;
  1398. +
  1399. + tp->mpcb->infinite_mapping_snd = 1;
  1400. + tp->mpcb->infinite_mapping_rcv = 1;
  1401. + tp->mptcp->fully_established = 1;
  1402. +
  1403. + return false;
  1404. +}
  1405. +
  1406. +/* Find the first free index in the bitfield */
  1407. +static inline int __mptcp_find_free_index(u8 bitfield, int j, u8 base)
  1408. +{
  1409. + int i;
  1410. + mptcp_for_each_bit_unset(bitfield >> base, i) {
  1411. + /* We wrapped at the bitfield - try from 0 on */
  1412. + if (i + base >= sizeof(bitfield) * 8) {
  1413. + mptcp_for_each_bit_unset(bitfield, i) {
  1414. + if (i >= sizeof(bitfield) * 8)
  1415. + goto exit;
  1416. +
  1417. + if (i != j)
  1418. + return i;
  1419. + }
  1420. + goto exit;
  1421. + }
  1422. + if (i + base >= sizeof(bitfield) * 8)
  1423. + break;
  1424. +
  1425. + if (i + base != j)
  1426. + return i + base;
  1427. + }
  1428. +exit:
  1429. + return -1;
  1430. +}
  1431. +
  1432. +static inline int mptcp_find_free_index(u8 bitfield)
  1433. +{
  1434. + return __mptcp_find_free_index(bitfield, -1, 0);
  1435. +}
  1436. +
  1437. +/* Find the first index whose bit in the bit-field == 0 */
  1438. +static inline u8 mptcp_set_new_pathindex(struct mptcp_cb *mpcb)
  1439. +{
  1440. + u8 base = mpcb->next_path_index;
  1441. + int i;
  1442. +
  1443. + /* Start at 1, because 0 is reserved for the meta-sk */
  1444. + mptcp_for_each_bit_unset(mpcb->path_index_bits >> base, i) {
  1445. + if (i + base < 1)
  1446. + continue;
  1447. + if (i + base >= sizeof(mpcb->path_index_bits) * 8)
  1448. + break;
  1449. + i += base;
  1450. + mpcb->path_index_bits |= (1 << i);
  1451. + mpcb->next_path_index = i + 1;
  1452. + return i;
  1453. + }
  1454. + mptcp_for_each_bit_unset(mpcb->path_index_bits, i) {
  1455. + if (i >= sizeof(mpcb->path_index_bits) * 8)
  1456. + break;
  1457. + if (i < 1)
  1458. + continue;
  1459. + mpcb->path_index_bits |= (1 << i);
  1460. + mpcb->next_path_index = i + 1;
  1461. + return i;
  1462. + }
  1463. +
  1464. + return 0;
  1465. +}
  1466. +
  1467. +static inline int mptcp_v6_is_v4_mapped(struct sock *sk)
  1468. +{
  1469. + return sk->sk_family == AF_INET6 &&
  1470. + ipv6_addr_type(&inet6_sk(sk)->saddr) == IPV6_ADDR_MAPPED;
  1471. +}
  1472. +
  1473. +/* TCP and MPTCP mpc flag-depending functions */
  1474. +u16 mptcp_select_window(struct sock *sk);
  1475. +void mptcp_init_buffer_space(struct sock *sk);
  1476. +void mptcp_tcp_set_rto(struct sock *sk);
  1477. +
  1478. +static inline void set_mpc(struct tcp_sock *tp)
  1479. +{
  1480. + tp->mpc = 1;
  1481. +
  1482. + tp->__select_window = __mptcp_select_window;
  1483. + tp->select_window = mptcp_select_window;
  1484. + tp->select_initial_window = mptcp_select_initial_window;
  1485. + tp->init_buffer_space = mptcp_init_buffer_space;
  1486. + tp->set_rto = mptcp_tcp_set_rto;
  1487. + tp->should_expand_sndbuf = mptcp_should_expand_sndbuf;
  1488. +}
  1489. +
  1490. +#else /* CONFIG_MPTCP */
  1491. +#define mptcp_debug(fmt, args...) \
  1492. + do { \
  1493. + } while (0)
  1494. +
  1495. +/* Without MPTCP, these iterators expand to nothing, so the statement
  1496. + * that follows them runs exactly once. This assumes that
  1497. + * the sk/tp arg is already the only socket available in that case.
  1498. + */
  1499. +#define mptcp_for_each_sk(mpcb, sk)
  1500. +#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp)
  1501. +
  1502. +static inline int mptcp_is_data_fin(const struct sk_buff *skb)
  1503. +{
  1504. + return 0;
  1505. +}
  1506. +static inline int mptcp_is_data_seq(const struct sk_buff *skb)
  1507. +{
  1508. + return 0;
  1509. +}
  1510. +static inline struct sock *mptcp_meta_sk(const struct sock *sk)
  1511. +{
  1512. + return NULL;
  1513. +}
  1514. +static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
  1515. +{
  1516. + return NULL;
  1517. +}
  1518. +static inline int is_meta_sk(const struct sock *sk)
  1519. +{
  1520. + return 0;
  1521. +}
  1522. +static inline int is_master_tp(const struct tcp_sock *tp)
  1523. +{
  1524. + return 0;
  1525. +}
  1526. +static inline void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp) {}
  1527. +static inline void mptcp_cleanup_rbuf(const struct sock *meta_sk, int copied) {}
  1528. +static inline void mptcp_del_sock(const struct sock *sk) {}
  1529. +static inline void mptcp_reinject_data(struct sock *orig_sk, int clone_it) {}
  1530. +static inline void mptcp_update_sndbuf(const struct mptcp_cb *mpcb) {}
  1531. +static inline void mptcp_skb_entail_init(const struct tcp_sock *tp,
  1532. + const struct sk_buff *skb) {}
  1533. +static inline void mptcp_clean_rtx_infinite(const struct sk_buff *skb,
  1534. + const struct sock *sk) {}
  1535. +static inline void mptcp_retransmit_timer(const struct sock *meta_sk) {}
  1536. +static inline int mptcp_write_wakeup(struct sock *meta_sk)
  1537. +{
  1538. + return 0;
  1539. +}
  1540. +static inline void mptcp_sub_close(struct sock *sk, unsigned long delay) {}
  1541. +static inline void mptcp_set_rto(const struct sock *sk) {}
  1542. +static inline void mptcp_send_fin(const struct sock *meta_sk) {}
  1543. +static inline void mptcp_parse_options(const uint8_t *ptr, const int opsize,
  1544. + const struct tcp_options_received *opt_rx,
  1545. + const struct mptcp_options_received *mopt,
  1546. + const struct sk_buff *skb) {}
  1547. +static inline void mptcp_syn_options(struct sock *sk,
  1548. + struct tcp_out_options *opts,
  1549. + unsigned *remaining) {}
  1550. +static inline void mptcp_synack_options(struct request_sock *req,
  1551. + struct tcp_out_options *opts,
  1552. + unsigned *remaining) {}
  1553. +
  1554. +static inline void mptcp_established_options(struct sock *sk,
  1555. + struct sk_buff *skb,
  1556. + struct tcp_out_options *opts,
  1557. + unsigned *size) {}
  1558. +static inline void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
  1559. + struct tcp_out_options *opts,
  1560. + struct sk_buff *skb) {}
  1561. +static inline void mptcp_close(struct sock *meta_sk, long timeout) {}
  1562. +static inline int mptcp_doit(struct sock *sk)
  1563. +{
  1564. + return 0;
  1565. +}
  1566. +static inline int mptcp_check_req_master(const struct sock *sk,
  1567. + const struct sock *child,
  1568. + struct request_sock *req,
  1569. + struct request_sock **prev,
  1570. + const struct mptcp_options_received *mopt)
  1571. +{
  1572. + return 1;
  1573. +}
  1574. +static inline struct sock *mptcp_check_req_child(struct sock *sk,
  1575. + struct sock *child,
  1576. + struct request_sock *req,
  1577. + struct request_sock **prev,
  1578. + struct mptcp_options_received *mopt)
  1579. +{
  1580. + return NULL;
  1581. +}
  1582. +static inline unsigned int mptcp_current_mss(struct sock *meta_sk)
  1583. +{
  1584. + return 0;
  1585. +}
  1586. +static inline int mptcp_select_size(const struct sock *meta_sk, bool sg)
  1587. +{
  1588. + return 0;
  1589. +}
  1590. +static inline void mptcp_sub_close_passive(struct sock *sk) {}
  1591. +static inline bool mptcp_fallback_infinite(const struct sock *sk, int flag)
  1592. +{
  1593. + return false;
  1594. +}
  1595. +static inline void mptcp_init_mp_opt(const struct mptcp_options_received *mopt) {}
  1596. +static inline int mptcp_check_rtt(const struct tcp_sock *tp, int time)
  1597. +{
  1598. + return 0;
  1599. +}
  1600. +static inline int mptcp_check_snd_buf(const struct tcp_sock *tp)
  1601. +{
  1602. + return 0;
  1603. +}
  1604. +static inline int mptcp_sysctl_syn_retries(void)
  1605. +{
  1606. + return 0;
  1607. +}
  1608. +static inline void mptcp_send_reset(const struct sock *sk) {}
  1609. +static inline void mptcp_send_active_reset(struct sock *meta_sk,
  1610. + gfp_t priority) {}
  1611. +static inline int mptcp_write_xmit(struct sock *sk, unsigned int mss_now,
  1612. + int nonagle, int push_one, gfp_t gfp)
  1613. +{
  1614. + return 0;
  1615. +}
  1616. +static inline struct sock *mptcp_sk_clone(const struct sock *sk, int family,
  1617. + const gfp_t priority)
  1618. +{
  1619. + return NULL;
  1620. +}
  1621. +static inline int mptcp_handle_options(struct sock *sk,
  1622. + const struct tcphdr *th,
  1623. + struct sk_buff *skb)
  1624. +{
  1625. + return 0;
  1626. +}
  1627. +static inline void mptcp_reset_mopt(struct tcp_sock *tp) {}
  1628. +static inline void __init mptcp_init(void) {}
  1629. +static inline int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
  1630. +{
  1631. + return 0;
  1632. +}
  1633. +static inline int mptcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
  1634. + unsigned int mss_now, int reinject)
  1635. +{
  1636. + return 0;
  1637. +}
  1638. +static inline int mptso_fragment(struct sock *sk, struct sk_buff *skb,
  1639. + unsigned int len, unsigned int mss_now,
  1640. + gfp_t gfp, int reinject)
  1641. +{
  1642. + return 0;
  1643. +}
  1644. +static inline bool mptcp_sk_can_gso(const struct sock *sk)
  1645. +{
  1646. + return false;
  1647. +}
  1648. +static inline bool mptcp_can_sg(const struct sock *meta_sk)
  1649. +{
  1650. + return false;
  1651. +}
  1652. +static inline unsigned int mptcp_xmit_size_goal(struct sock *meta_sk,
  1653. + u32 mss_now, int large_allowed)
  1654. +{
  1655. + return 0;
  1656. +}
  1657. +static inline void mptcp_destroy_sock(struct sock *sk) {}
  1658. +static inline int mptcp_rcv_synsent_state_process(struct sock *sk,
  1659. + struct sock **skptr,
  1660. + struct sk_buff *skb,
  1661. + struct mptcp_options_received *mopt)
  1662. +{
  1663. + return 0;
  1664. +}
  1665. +static inline bool mptcp_can_sendpage(struct sock *sk)
  1666. +{
  1667. + return false;
  1668. +}
  1669. +static inline int mptcp_time_wait(struct sock *sk, struct tcp_timewait_sock *tw)
  1670. +{
  1671. + return 0;
  1672. +}
  1673. +static inline void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) {}
  1674. +static inline void mptcp_update_tw_socks(const struct tcp_sock *tp, int state) {}
  1675. +static inline void mptcp_disconnect(struct sock *sk) {}
  1676. +static inline void mptcp_tsq_flags(struct sock *sk) {}
  1677. +static inline void mptcp_tsq_sub_deferred(struct sock *meta_sk) {}
  1678. +static inline void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) {}
  1679. +static inline void mptcp_hash_remove(struct tcp_sock *meta_tp) {}
  1680. +static inline void mptcp_reqsk_new_mptcp(struct request_sock *req,
  1681. + const struct tcp_options_received *rx_opt,
  1682. + const struct mptcp_options_received *mopt,
  1683. + const struct sk_buff *skb) {}
  1684. +static inline void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
  1685. + const struct sk_buff *skb) {}
  1686. +#endif /* CONFIG_MPTCP */
  1687. +
  1688. +#endif /* _MPTCP_H */
  1689. diff --git a/include/net/mptcp_v4.h b/include/net/mptcp_v4.h
  1690. new file mode 100644
  1691. index 0000000..047884c
  1692. --- /dev/null
  1693. +++ b/include/net/mptcp_v4.h
  1694. @@ -0,0 +1,69 @@
  1695. +/*
  1696. + * MPTCP implementation
  1697. + *
  1698. + * Initial Design & Implementation:
  1699. + * Sébastien Barré <sebastien.barre@uclouvain.be>
  1700. + *
  1701. + * Current Maintainer & Author:
  1702. + * Christoph Paasch <christoph.paasch@uclouvain.be>
  1703. + *
  1704. + * Additional authors:
  1705. + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
  1706. + * Gregory Detal <gregory.detal@uclouvain.be>
  1707. + * Fabien Duchêne <fabien.duchene@uclouvain.be>
  1708. + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
  1709. + * Lavkesh Lahngir <lavkesh51@gmail.com>
  1710. + * Andreas Ripke <ripke@neclab.eu>
  1711. + * Vlad Dogaru <vlad.dogaru@intel.com>
  1712. + * Octavian Purdila <octavian.purdila@intel.com>
  1713. + * John Ronan <jronan@tssg.org>
  1714. + * Catalin Nicutar <catalin.nicutar@gmail.com>
  1715. + * Brandon Heller <brandonh@stanford.edu>
  1716. + *
  1717. + *
  1718. + * This program is free software; you can redistribute it and/or
  1719. + * modify it under the terms of the GNU General Public License
  1720. + * as published by the Free Software Foundation; either version
  1721. + * 2 of the License, or (at your option) any later version.
  1722. + */
  1723. +
  1724. +#ifndef MPTCP_V4_H_
  1725. +#define MPTCP_V4_H_
  1726. +
  1727. +
  1728. +#include <linux/in.h>
  1729. +#include <linux/skbuff.h>
  1730. +#include <net/mptcp.h>
  1731. +#include <net/request_sock.h>
  1732. +#include <net/sock.h>
  1733. +
  1734. +extern struct request_sock_ops mptcp_request_sock_ops;
  1735. +
  1736. +#ifdef CONFIG_MPTCP
  1737. +
  1738. +int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
  1739. +int mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id);
  1740. +int mptcp_v4_add_raddress(struct mptcp_cb *mpcb, const struct in_addr *addr,
  1741. + __be16 port, u8 id);
  1742. +void mptcp_v4_set_init_addr_bit(struct mptcp_cb *mpcb, __be32 daddr, int index);
  1743. +struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
  1744. + const __be32 laddr, const struct net *net);
  1745. +int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
  1746. + struct mptcp_rem4 *rem);
  1747. +int mptcp_pm_v4_init(void);
  1748. +void mptcp_pm_v4_undo(void);
  1749. +u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
  1750. + u32 seq);
  1751. +u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
  1752. +
  1753. +#else
  1754. +
  1755. +static inline int mptcp_v4_do_rcv(const struct sock *meta_sk,
  1756. + const struct sk_buff *skb)
  1757. +{
  1758. + return 0;
  1759. +}
  1760. +
  1761. +#endif /* CONFIG_MPTCP */
  1762. +
  1763. +#endif /* MPTCP_V4_H_ */
  1764. diff --git a/include/net/mptcp_v6.h b/include/net/mptcp_v6.h
  1765. new file mode 100644
  1766. index 0000000..c303208
  1767. --- /dev/null
  1768. +++ b/include/net/mptcp_v6.h
  1769. @@ -0,0 +1,72 @@
  1770. +/*
  1771. + * MPTCP implementation
  1772. + *
  1773. + * Initial Design & Implementation:
  1774. + * Sébastien Barré <sebastien.barre@uclouvain.be>
  1775. + *
  1776. + * Current Maintainer & Author:
  1777. + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
  1778. + *
  1779. + * Additional authors:
  1780. + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
  1781. + * Gregory Detal <gregory.detal@uclouvain.be>
  1782. + * Fabien Duchêne <fabien.duchene@uclouvain.be>
  1783. + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
  1784. + * Lavkesh Lahngir <lavkesh51@gmail.com>
  1785. + * Andreas Ripke <ripke@neclab.eu>
  1786. + * Vlad Dogaru <vlad.dogaru@intel.com>
  1787. + * Octavian Purdila <octavian.purdila@intel.com>
  1788. + * John Ronan <jronan@tssg.org>
  1789. + * Catalin Nicutar <catalin.nicutar@gmail.com>
  1790. + * Brandon Heller <brandonh@stanford.edu>
  1791. + *
  1792. + *
  1793. + * This program is free software; you can redistribute it and/or
  1794. + * modify it under the terms of the GNU General Public License
  1795. + * as published by the Free Software Foundation; either version
  1796. + * 2 of the License, or (at your option) any later version.
  1797. + */
  1798. +
  1799. +#ifndef _MPTCP_V6_H
  1800. +#define _MPTCP_V6_H
  1801. +
  1802. +#include <linux/in6.h>
  1803. +#include <net/if_inet6.h>
  1804. +
  1805. +#include <net/mptcp.h>
  1806. +
  1807. +extern struct request_sock_ops mptcp6_request_sock_ops;
  1808. +extern struct proto mptcpv6_prot;
  1809. +
  1810. +#ifdef CONFIG_MPTCP
  1811. +
  1812. +int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
  1813. +int mptcp_v6_rem_raddress(struct mptcp_cb *mpcb, u8 id);
  1814. +int mptcp_v6_add_raddress(struct mptcp_cb *mpcb, const struct in6_addr *addr,
  1815. + __be16 port, u8 id);
  1816. +void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb,
  1817. + const struct in6_addr *daddr, int index);
  1818. +struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
  1819. + const struct in6_addr *laddr, const struct net *net);
  1820. +int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
  1821. + struct mptcp_rem6 *rem);
  1822. +int mptcp_pm_v6_init(void);
  1823. +void mptcp_pm_v6_undo(void);
  1824. +struct sock *mptcp_v6v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
  1825. + struct request_sock *req,
  1826. + struct dst_entry *dst);
  1827. +__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
  1828. + __be16 sport, __be16 dport, u32 seq);
  1829. +u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
  1830. + __be16 sport, __be16 dport);
  1831. +
  1832. +#else /* CONFIG_MPTCP */
  1833. +
  1834. +static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
  1835. +{
  1836. + return 0;
  1837. +}
  1838. +
  1839. +#endif /* CONFIG_MPTCP */
  1840. +
  1841. +#endif /* _MPTCP_V6_H */
  1842. diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
  1843. index 991dcd9..6297c97 100644
  1844. --- a/include/net/net_namespace.h
  1845. +++ b/include/net/net_namespace.h
  1846. @@ -15,6 +15,7 @@
  1847. #include <net/netns/packet.h>
  1848. #include <net/netns/ipv4.h>
  1849. #include <net/netns/ipv6.h>
  1850. +#include <net/netns/mptcp.h>
  1851. #include <net/netns/sctp.h>
  1852. #include <net/netns/dccp.h>
  1853. #include <net/netns/netfilter.h>
  1854. @@ -90,6 +91,9 @@ struct net {
  1855. #if IS_ENABLED(CONFIG_IPV6)
  1856. struct netns_ipv6 ipv6;
  1857. #endif
  1858. +#if IS_ENABLED(CONFIG_MPTCP)
  1859. + struct netns_mptcp mptcp;
  1860. +#endif
  1861. #if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE)
  1862. struct netns_sctp sctp;
  1863. #endif
  1864. diff --git a/include/net/netns/mptcp.h b/include/net/netns/mptcp.h
  1865. new file mode 100644
  1866. index 0000000..bad418b
  1867. --- /dev/null
  1868. +++ b/include/net/netns/mptcp.h
  1869. @@ -0,0 +1,44 @@
  1870. +/*
  1871. + * MPTCP implementation - MPTCP namespace
  1872. + *
  1873. + * Initial Design & Implementation:
  1874. + * Sébastien Barré <sebastien.barre@uclouvain.be>
  1875. + *
  1876. + * Current Maintainer:
  1877. + * Christoph Paasch <christoph.paasch@uclouvain.be>
  1878. + *
  1879. + * Additional authors:
  1880. + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
  1881. + * Gregory Detal <gregory.detal@uclouvain.be>
  1882. + * Fabien Duchêne <fabien.duchene@uclouvain.be>
  1883. + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
  1884. + * Lavkesh Lahngir <lavkesh51@gmail.com>
  1885. + * Andreas Ripke <ripke@neclab.eu>
  1886. + * Vlad Dogaru <vlad.dogaru@intel.com>
  1887. + * Octavian Purdila <octavian.purdila@intel.com>
  1888. + * John Ronan <jronan@tssg.org>
  1889. + * Catalin Nicutar <catalin.nicutar@gmail.com>
  1890. + * Brandon Heller <brandonh@stanford.edu>
  1891. + *
  1892. + *
  1893. + * This program is free software; you can redistribute it and/or
  1894. + * modify it under the terms of the GNU General Public License
  1895. + * as published by the Free Software Foundation; either version
  1896. + * 2 of the License, or (at your option) any later version.
  1897. + */
  1898. +
  1899. +#ifndef __NETNS_MPTCP_H__
  1900. +#define __NETNS_MPTCP_H__
  1901. +
  1902. +#include <linux/compiler.h>
  1903. +
  1904. +enum {
  1905. + MPTCP_PM_FULLMESH = 0,
  1906. + MPTCP_PM_MAX
  1907. +};
  1908. +
  1909. +struct netns_mptcp {
  1910. + void *path_managers[MPTCP_PM_MAX];
  1911. +};
  1912. +
  1913. +#endif /* __NETNS_MPTCP_H__ */
  1914. diff --git a/include/net/request_sock.h b/include/net/request_sock.h
  1915. index 7f830ff..e79e87a 100644
  1916. --- a/include/net/request_sock.h
  1917. +++ b/include/net/request_sock.h
  1918. @@ -164,7 +164,7 @@ struct request_sock_queue {
  1919. };
  1920. int reqsk_queue_alloc(struct request_sock_queue *queue,
  1921. - unsigned int nr_table_entries);
  1922. + unsigned int nr_table_entries, gfp_t flags);
  1923. void __reqsk_queue_destroy(struct request_sock_queue *queue);
  1924. void reqsk_queue_destroy(struct request_sock_queue *queue);
  1925. diff --git a/include/net/sock.h b/include/net/sock.h
  1926. index b9586a1..09a682e 100644
  1927. --- a/include/net/sock.h
  1928. +++ b/include/net/sock.h
  1929. @@ -899,6 +899,16 @@ void sk_clear_memalloc(struct sock *sk);
  1930. int sk_wait_data(struct sock *sk, long *timeo);
  1931. +/* START - needed for MPTCP */
  1932. +extern void sock_def_error_report(struct sock *sk);
  1933. +extern struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
  1934. + int family);
  1935. +extern void sock_lock_init(struct sock *sk);
  1936. +
  1937. +extern struct lock_class_key af_callback_keys[AF_MAX];
  1938. +extern char *const af_family_clock_key_strings[AF_MAX+1];
  1939. +/* END - needed for MPTCP */
  1940. +
  1941. struct request_sock_ops;
  1942. struct timewait_sock_ops;
  1943. struct inet_hashinfo;
  1944. diff --git a/include/net/tcp.h b/include/net/tcp.h
  1945. index 743acce..db0cc04 100644
  1946. --- a/include/net/tcp.h
  1947. +++ b/include/net/tcp.h
  1948. @@ -176,6 +176,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
  1949. #define TCPOPT_SACK 5 /* SACK Block */
  1950. #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
  1951. #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */
  1952. +#define TCPOPT_MPTCP 30
  1953. #define TCPOPT_EXP 254 /* Experimental */
  1954. /* Magic number to be after the option value for sharing TCP
  1955. * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
  1956. @@ -234,6 +235,27 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
  1957. */
  1958. #define TFO_SERVER_ALWAYS 0x1000
  1959. +/* Flags from tcp_input.c for tcp_ack */
  1960. +#define FLAG_DATA 0x01 /* Incoming frame contained data. */
  1961. +#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
  1962. +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
  1963. +#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
  1964. +#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
  1965. +#define FLAG_DATA_SACKED 0x20 /* New SACK. */
  1966. +#define FLAG_ECE 0x40 /* ECE in this ACK */
  1967. +#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
  1968. +#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
  1969. +#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
  1970. +#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
  1971. +#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
  1972. +#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
  1973. +#define MPTCP_FLAG_DATA_ACKED 0x8000
  1974. +
  1975. +#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
  1976. +#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
  1977. +#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
  1978. +#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
  1979. +
  1980. extern struct inet_timewait_death_row tcp_death_row;
  1981. /* sysctl variables for tcp */
  1982. @@ -349,6 +371,112 @@ extern struct proto tcp_prot;
  1983. #define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val)
  1984. #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
  1985. +/**** START - Exports needed for MPTCP ****/
  1986. +extern const struct inet_connection_sock_af_ops ipv4_specific;
  1987. +extern const struct inet_connection_sock_af_ops ipv6_specific;
  1988. +extern const struct inet_connection_sock_af_ops ipv6_mapped;
  1989. +extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
  1990. +extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
  1991. +
  1992. +struct mptcp_options_received;
  1993. +
  1994. +int tcp_close_state(struct sock *sk);
  1995. +void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int
  1996. + size_goal);
  1997. +void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
  1998. + const struct sk_buff *skb);
  1999. +int tcp_xmit_probe_skb(struct sock *sk, int urgent);
  2000. +void tcp_cwnd_validate(struct sock *sk);
  2001. +void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb);
  2002. +int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
  2003. + gfp_t gfp_mask);
  2004. +unsigned int tcp_mss_split_point(const struct sock *sk,
  2005. + const struct sk_buff *skb,
  2006. + unsigned int mss_now,
  2007. + unsigned int max_segs,
  2008. + int nonagle);
  2009. +bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb);
  2010. +bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
  2011. + unsigned int cur_mss, int nonagle);
  2012. +bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
  2013. + unsigned int cur_mss);
  2014. +unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb);
  2015. +int tcp_mtu_probe(struct sock *sk);
  2016. +int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
  2017. + unsigned int mss_now);
  2018. +void __pskb_trim_head(struct sk_buff *skb, int len);
  2019. +void tcp_queue_skb(struct sock *sk, struct sk_buff *skb);
  2020. +void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags);
  2021. +void tcp_reset(struct sock *sk);
  2022. +bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
  2023. + const u32 ack_seq, const u32 nwin);
  2024. +bool tcp_urg_mode(const struct tcp_sock *tp);
  2025. +void tcp_ack_probe(struct sock *sk);
  2026. +void tcp_rearm_rto(struct sock *sk);
  2027. +int tcp_write_timeout(struct sock *sk);
  2028. +bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
  2029. + unsigned int timeout, bool syn_set);
  2030. +void tcp_write_err(struct sock *sk);
  2031. +void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr);
  2032. +void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
  2033. + unsigned int mss_now);
  2034. +
  2035. +int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req);
  2036. +void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  2037. + struct request_sock *req);
  2038. +__u32 tcp_v4_init_sequence(const struct sk_buff *skb);
  2039. +int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
  2040. + struct request_sock *req,
  2041. + u16 queue_mapping);
  2042. +void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb);
  2043. +struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb);
  2044. +struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb);
  2045. +void tcp_v4_reqsk_destructor(struct request_sock *req);
  2046. +
  2047. +int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req);
  2048. +void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  2049. + struct request_sock *req);
  2050. +__u32 tcp_v6_init_sequence(const struct sk_buff *skb);
  2051. +int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
  2052. + struct flowi6 *fl6, struct request_sock *req,
  2053. + u16 queue_mapping);
  2054. +void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
  2055. +int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
  2056. +int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
  2057. +void tcp_v6_destroy_sock(struct sock *sk);
  2058. +void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
  2059. +void tcp_v6_hash(struct sock *sk);
  2060. +struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb);
  2061. +struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
  2062. + struct request_sock *req,
  2063. + struct dst_entry *dst);
  2064. +void tcp_v6_reqsk_destructor(struct request_sock *req);
  2065. +
  2066. +void sock_valbool_flag(struct sock *sk, int bit, int valbool);
  2067. +unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
  2068. + int large_allowed);
  2069. +u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb);
  2070. +
  2071. +void skb_clone_fraglist(struct sk_buff *skb);
  2072. +void copy_skb_header(struct sk_buff *new, const struct sk_buff *old);
  2073. +
  2074. +void inet_twsk_free(struct inet_timewait_sock *tw);
  2075. +/* These states need RST on ABORT according to RFC793 */
  2076. +static inline bool tcp_need_reset(int state)
  2077. +{
  2078. + return (1 << state) &
  2079. + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
  2080. + TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
  2081. +}
  2082. +
  2083. +bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
  2084. + int hlen);
  2085. +int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
  2086. + bool *fragstolen);
  2087. +bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to,
  2088. + struct sk_buff *from, bool *fragstolen);
  2089. +/**** END - Exports needed for MPTCP ****/
  2090. +
  2091. void tcp_tasklet_init(void);
  2092. void tcp_v4_err(struct sk_buff *skb, u32);
  2093. @@ -445,6 +573,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
  2094. size_t len, int nonblock, int flags, int *addr_len);
  2095. void tcp_parse_options(const struct sk_buff *skb,
  2096. struct tcp_options_received *opt_rx,
  2097. + struct mptcp_options_received *mopt_rx,
  2098. int estab, struct tcp_fastopen_cookie *foc);
  2099. const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
  2100. @@ -557,11 +686,15 @@ void tcp_send_delayed_ack(struct sock *sk);
  2101. void tcp_send_loss_probe(struct sock *sk);
  2102. bool tcp_schedule_loss_probe(struct sock *sk);
  2103. +u16 tcp_select_window(struct sock *sk);
  2104. +
  2105. /* tcp_input.c */
  2106. void tcp_cwnd_application_limited(struct sock *sk);
  2107. void tcp_resume_early_retransmit(struct sock *sk);
  2108. void tcp_rearm_rto(struct sock *sk);
  2109. void tcp_reset(struct sock *sk);
  2110. +void tcp_set_rto(struct sock *sk);
  2111. +bool tcp_should_expand_sndbuf(const struct sock *sk);
  2112. /* tcp_timer.c */
  2113. void tcp_init_xmit_timers(struct sock *);
  2114. @@ -705,14 +838,24 @@ void tcp_send_window_probe(struct sock *sk);
  2115. */
  2116. struct tcp_skb_cb {
  2117. union {
  2118. - struct inet_skb_parm h4;
  2119. + union {
  2120. + struct inet_skb_parm h4;
  2121. #if IS_ENABLED(CONFIG_IPV6)
  2122. - struct inet6_skb_parm h6;
  2123. + struct inet6_skb_parm h6;
  2124. #endif
  2125. - } header; /* For incoming frames */
  2126. + } header; /* For incoming frames */
  2127. +#ifdef CONFIG_MPTCP
  2128. + __u32 path_mask; /* path indices that tried to send this skb */
  2129. +#endif
  2130. + };
  2131. __u32 seq; /* Starting sequence number */
  2132. __u32 end_seq; /* SEQ + FIN + SYN + datalen */
  2133. __u32 when; /* used to compute rtt's */
  2134. +#ifdef CONFIG_MPTCP
  2135. + __u8 mptcp_flags; /* flags for the MPTCP layer */
  2136. + __u8 dss_off; /* Number of 4-byte words until
  2137. + * seq-number */
  2138. +#endif
  2139. __u8 tcp_flags; /* TCP header flags. (tcp[13]) */
  2140. __u8 sacked; /* State flags for SACK/FACK. */
  2141. @@ -1058,7 +1201,8 @@ u32 tcp_default_init_rwnd(u32 mss);
  2142. /* Determine a window scaling and initial window to offer. */
  2143. void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
  2144. __u32 *window_clamp, int wscale_ok,
  2145. - __u8 *rcv_wscale, __u32 init_rcv_wnd);
  2146. + __u8 *rcv_wscale, __u32 init_rcv_wnd,
  2147. + const struct sock *sk);
  2148. static inline int tcp_win_from_space(int space)
  2149. {
  2150. @@ -1070,12 +1214,18 @@ static inline int tcp_win_from_space(int space)
  2151. /* Note: caller must be prepared to deal with negative returns */
  2152. static inline int tcp_space(const struct sock *sk)
  2153. {
  2154. + if (tcp_sk(sk)->mpc)
  2155. + sk = tcp_sk(sk)->meta_sk;
  2156. +
  2157. return tcp_win_from_space(sk->sk_rcvbuf -
  2158. atomic_read(&sk->sk_rmem_alloc));
  2159. }
  2160. static inline int tcp_full_space(const struct sock *sk)
  2161. {
  2162. + if (tcp_sk(sk)->mpc)
  2163. + sk = tcp_sk(sk)->meta_sk;
  2164. +
  2165. return tcp_win_from_space(sk->sk_rcvbuf);
  2166. }
  2167. @@ -1090,6 +1240,7 @@ static inline void tcp_openreq_init(struct request_sock *req,
  2168. tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
  2169. tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
  2170. tcp_rsk(req)->snt_synack = 0;
  2171. + tcp_rsk(req)->saw_mpc = 0;
  2172. req->mss = rx_opt->mss_clamp;
  2173. req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
  2174. ireq->tstamp_ok = rx_opt->tstamp_ok;
  2175. diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
  2176. index d758163..3d81e49 100644
  2177. --- a/include/uapi/linux/if.h
  2178. +++ b/include/uapi/linux/if.h
  2179. @@ -53,6 +53,9 @@
  2180. #define IFF_ECHO 0x40000 /* echo sent packets */
  2181. +#define IFF_NOMULTIPATH 0x80000 /* Disable for MPTCP */
  2182. +#define IFF_MPBACKUP 0x100000 /* Use as backup path for MPTCP */
  2183. +
  2184. #define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\
  2185. IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
  2186. diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
  2187. index 377f1e5..2ffcb03 100644
  2188. --- a/include/uapi/linux/tcp.h
  2189. +++ b/include/uapi/linux/tcp.h
  2190. @@ -112,6 +112,7 @@ enum {
  2191. #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */
  2192. #define TCP_TIMESTAMP 24
  2193. #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */
  2194. +#define MPTCP_ENABLED 26
  2195. struct tcp_repair_opt {
  2196. __u32 opt_code;
  2197. diff --git a/net/Kconfig b/net/Kconfig
  2198. index e411046..3e4b278 100644
  2199. --- a/net/Kconfig
  2200. +++ b/net/Kconfig
  2201. @@ -79,6 +79,7 @@ if INET
  2202. source "net/ipv4/Kconfig"
  2203. source "net/ipv6/Kconfig"
  2204. source "net/netlabel/Kconfig"
  2205. +source "net/mptcp/Kconfig"
  2206. endif # if INET
  2207. diff --git a/net/Makefile b/net/Makefile
  2208. index cbbbe6d..244bac1 100644
  2209. --- a/net/Makefile
  2210. +++ b/net/Makefile
  2211. @@ -20,6 +20,7 @@ obj-$(CONFIG_INET) += ipv4/
  2212. obj-$(CONFIG_XFRM) += xfrm/
  2213. obj-$(CONFIG_UNIX) += unix/
  2214. obj-$(CONFIG_NET) += ipv6/
  2215. +obj-$(CONFIG_MPTCP) += mptcp/
  2216. obj-$(CONFIG_PACKET) += packet/
  2217. obj-$(CONFIG_NET_KEY) += key/
  2218. obj-$(CONFIG_BRIDGE) += bridge/
  2219. diff --git a/net/core/dev.c b/net/core/dev.c
  2220. index 45fa2f1..3cfdbc0 100644
  2221. --- a/net/core/dev.c
  2222. +++ b/net/core/dev.c
  2223. @@ -5271,7 +5271,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
  2224. dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
  2225. IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
  2226. - IFF_AUTOMEDIA)) |
  2227. + IFF_AUTOMEDIA | IFF_NOMULTIPATH | IFF_MPBACKUP)) |
  2228. (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
  2229. IFF_ALLMULTI));
  2230. diff --git a/net/core/request_sock.c b/net/core/request_sock.c
  2231. index 4425148..e128f08 100644
  2232. --- a/net/core/request_sock.c
  2233. +++ b/net/core/request_sock.c
  2234. @@ -38,7 +38,8 @@ int sysctl_max_syn_backlog = 256;
  2235. EXPORT_SYMBOL(sysctl_max_syn_backlog);
  2236. int reqsk_queue_alloc(struct request_sock_queue *queue,
  2237. - unsigned int nr_table_entries)
  2238. + unsigned int nr_table_entries,
  2239. + gfp_t flags)
  2240. {
  2241. size_t lopt_size = sizeof(struct listen_sock);
  2242. struct listen_sock *lopt;
  2243. @@ -48,9 +49,11 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
  2244. nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
  2245. lopt_size += nr_table_entries * sizeof(struct request_sock *);
  2246. if (lopt_size > PAGE_SIZE)
  2247. - lopt = vzalloc(lopt_size);
  2248. + lopt = __vmalloc(lopt_size,
  2249. + flags | __GFP_HIGHMEM | __GFP_ZERO,
  2250. + PAGE_KERNEL);
  2251. else
  2252. - lopt = kzalloc(lopt_size, GFP_KERNEL);
  2253. + lopt = kzalloc(lopt_size, flags);
  2254. if (lopt == NULL)
  2255. return -ENOMEM;
  2256. diff --git a/net/core/skbuff.c b/net/core/skbuff.c
  2257. index 90b96a1..2564d89 100644
  2258. --- a/net/core/skbuff.c
  2259. +++ b/net/core/skbuff.c
  2260. @@ -472,7 +472,7 @@ static inline void skb_drop_fraglist(struct sk_buff *skb)
  2261. skb_drop_list(&skb_shinfo(skb)->frag_list);
  2262. }
  2263. -static void skb_clone_fraglist(struct sk_buff *skb)
  2264. +void skb_clone_fraglist(struct sk_buff *skb)
  2265. {
  2266. struct sk_buff *list;
  2267. @@ -894,7 +894,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
  2268. skb->inner_mac_header += off;
  2269. }
  2270. -static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
  2271. +void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
  2272. {
  2273. __copy_skb_header(new, old);
  2274. diff --git a/net/core/sock.c b/net/core/sock.c
  2275. index c0fc6bd..7314971 100644
  2276. --- a/net/core/sock.c
  2277. +++ b/net/core/sock.c
  2278. @@ -231,7 +231,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  2279. "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
  2280. "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX"
  2281. };
  2282. -static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  2283. +char *const af_family_clock_key_strings[AF_MAX+1] = {
  2284. "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
  2285. "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
  2286. "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
  2287. @@ -252,7 +252,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  2288. * sk_callback_lock locking rules are per-address-family,
  2289. * so split the lock classes by using a per-AF key:
  2290. */
  2291. -static struct lock_class_key af_callback_keys[AF_MAX];
  2292. +struct lock_class_key af_callback_keys[AF_MAX];
  2293. /* Take into consideration the size of the struct sk_buff overhead in the
  2294. * determination of these values, since that is non-constant across
  2295. @@ -602,7 +602,7 @@ out:
  2296. return ret;
  2297. }
  2298. -static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
  2299. +void sock_valbool_flag(struct sock *sk, int bit, int valbool)
  2300. {
  2301. if (valbool)
  2302. sock_set_flag(sk, bit);
  2303. @@ -1204,7 +1204,7 @@ lenout:
  2304. *
  2305. * (We also register the sk_lock with the lock validator.)
  2306. */
  2307. -static inline void sock_lock_init(struct sock *sk)
  2308. +void sock_lock_init(struct sock *sk)
  2309. {
  2310. sock_lock_init_class_and_name(sk,
  2311. af_family_slock_key_strings[sk->sk_family],
  2312. @@ -1252,7 +1252,7 @@ void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
  2313. }
  2314. EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
  2315. -static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
  2316. +struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
  2317. int family)
  2318. {
  2319. struct sock *sk;
  2320. @@ -2184,7 +2184,7 @@ static void sock_def_wakeup(struct sock *sk)
  2321. rcu_read_unlock();
  2322. }
  2323. -static void sock_def_error_report(struct sock *sk)
  2324. +void sock_def_error_report(struct sock *sk)
  2325. {
  2326. struct socket_wq *wq;
  2327. diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
  2328. index 05c57f0..630434d 100644
  2329. --- a/net/ipv4/Kconfig
  2330. +++ b/net/ipv4/Kconfig
  2331. @@ -556,6 +556,30 @@ config TCP_CONG_ILLINOIS
  2332. For further details see:
  2333. http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
  2334. +config TCP_CONG_COUPLED
  2335. + tristate "MPTCP COUPLED CONGESTION CONTROL"
  2336. + depends on MPTCP
  2337. + default n
  2338. + ---help---
  2339. + MultiPath TCP Coupled Congestion Control
  2340. + To enable it, just put 'coupled' in tcp_congestion_control
  2341. +
  2342. +config TCP_CONG_OLIA
  2343. + tristate "MPTCP Opportunistic Linked Increase"
  2344. + depends on MPTCP
  2345. + default n
  2346. + ---help---
  2347. + MultiPath TCP Opportunistic Linked Increase Congestion Control
  2348. + To enable it, just put 'olia' in tcp_congestion_control
  2349. +
  2350. +config TCP_CONG_WVEGAS
  2351. + tristate "MPTCP WVEGAS CONGESTION CONTROL"
  2352. + depends on MPTCP
  2353. + default n
  2354. + ---help---
  2355. + wVegas congestion control for MPTCP
  2356. + To enable it, just put 'wvegas' in tcp_congestion_control
  2357. +
  2358. choice
  2359. prompt "Default TCP congestion control"
  2360. default DEFAULT_CUBIC
  2361. @@ -584,6 +608,15 @@ choice
  2362. config DEFAULT_WESTWOOD
  2363. bool "Westwood" if TCP_CONG_WESTWOOD=y
  2364. + config DEFAULT_COUPLED
  2365. + bool "Coupled" if TCP_CONG_COUPLED=y
  2366. +
  2367. + config DEFAULT_OLIA
  2368. + bool "Olia" if TCP_CONG_OLIA=y
  2369. +
  2370. + config DEFAULT_WVEGAS
  2371. + bool "Wvegas" if TCP_CONG_WVEGAS=y
  2372. +
  2373. config DEFAULT_RENO
  2374. bool "Reno"
  2375. @@ -605,6 +638,9 @@ config DEFAULT_TCP_CONG
  2376. default "vegas" if DEFAULT_VEGAS
  2377. default "westwood" if DEFAULT_WESTWOOD
  2378. default "veno" if DEFAULT_VENO
  2379. + default "coupled" if DEFAULT_COUPLED
        + default "olia" if DEFAULT_OLIA
  2380. + default "wvegas" if DEFAULT_WVEGAS
  2381. default "reno" if DEFAULT_RENO
  2382. default "cubic"
  2383. diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
  2384. index 19ab78a..567918a 100644
  2385. --- a/net/ipv4/af_inet.c
  2386. +++ b/net/ipv4/af_inet.c
  2387. @@ -104,6 +104,7 @@
  2388. #include <net/ip_fib.h>
  2389. #include <net/inet_connection_sock.h>
  2390. #include <net/tcp.h>
  2391. +#include <net/mptcp.h>
  2392. #include <net/udp.h>
  2393. #include <net/udplite.h>
  2394. #include <net/ping.h>
  2395. @@ -246,8 +247,7 @@ EXPORT_SYMBOL(inet_listen);
  2396. * Create an inet socket.
  2397. */
  2398. -static int inet_create(struct net *net, struct socket *sock, int protocol,
  2399. - int kern)
  2400. +int inet_create(struct net *net, struct socket *sock, int protocol, int kern)
  2401. {
  2402. struct sock *sk;
  2403. struct inet_protosw *answer;
  2404. @@ -679,6 +679,23 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
  2405. lock_sock(sk2);
  2406. sock_rps_record_flow(sk2);
  2407. +
  2408. + if (sk2->sk_protocol == IPPROTO_TCP && tcp_sk(sk2)->mpc) {
  2409. + struct sock *sk_it = sk2;
  2410. +
  2411. + mptcp_for_each_sk(tcp_sk(sk2)->mpcb, sk_it)
  2412. + sock_rps_record_flow(sk_it);
  2413. +
  2414. + if (tcp_sk(sk2)->mpcb->master_sk) {
  2415. + sk_it = tcp_sk(sk2)->mpcb->master_sk;
  2416. +
  2417. + write_lock_bh(&sk_it->sk_callback_lock);
  2418. + sk_it->sk_wq = newsock->wq;
  2419. + sk_it->sk_socket = newsock;
  2420. + write_unlock_bh(&sk_it->sk_callback_lock);
  2421. + }
  2422. + }
  2423. +
  2424. WARN_ON(!((1 << sk2->sk_state) &
  2425. (TCPF_ESTABLISHED | TCPF_SYN_RECV |
  2426. TCPF_CLOSE_WAIT | TCPF_CLOSE)));
  2427. @@ -1767,6 +1784,9 @@ static int __init inet_init(void)
  2428. ip_init();
  2429. + /* We must initialize MPTCP before TCP. */
  2430. + mptcp_init();
  2431. +
  2432. tcp_v4_init();
  2433. /* Setup TCP slab cache for open requests. */
  2434. diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
  2435. index 0d1e2cb..423dfb6 100644
  2436. --- a/net/ipv4/inet_connection_sock.c
  2437. +++ b/net/ipv4/inet_connection_sock.c
  2438. @@ -23,6 +23,7 @@
  2439. #include <net/route.h>
  2440. #include <net/tcp_states.h>
  2441. #include <net/xfrm.h>
  2442. +#include <net/mptcp.h>
  2443. #ifdef INET_CSK_DEBUG
  2444. const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
  2445. @@ -468,8 +469,8 @@ no_route:
  2446. }
  2447. EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
  2448. -static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
  2449. - const u32 rnd, const u32 synq_hsize)
  2450. +u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
  2451. + const u32 synq_hsize)
  2452. {
  2453. return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
  2454. }
  2455. @@ -667,7 +668,12 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
  2456. const struct request_sock *req,
  2457. const gfp_t priority)
  2458. {
  2459. - struct sock *newsk = sk_clone_lock(sk, priority);
  2460. + struct sock *newsk;
  2461. +
  2462. + if (sk->sk_protocol == IPPROTO_TCP && tcp_sk(sk)->mpc)
  2463. + newsk = mptcp_sk_clone(sk, req->rsk_ops->family, priority);
  2464. + else
  2465. + newsk = sk_clone_lock(sk, priority);
  2466. if (newsk != NULL) {
  2467. struct inet_connection_sock *newicsk = inet_csk(newsk);
  2468. @@ -744,7 +750,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
  2469. {
  2470. struct inet_sock *inet = inet_sk(sk);
  2471. struct inet_connection_sock *icsk = inet_csk(sk);
  2472. - int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
  2473. + int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries,
  2474. + GFP_KERNEL);
  2475. if (rc != 0)
  2476. return rc;
  2477. @@ -802,9 +809,14 @@ void inet_csk_listen_stop(struct sock *sk)
  2478. while ((req = acc_req) != NULL) {
  2479. struct sock *child = req->sk;
  2480. + bool mutex_taken = false;
  2481. acc_req = req->dl_next;
  2482. + if (is_meta_sk(child)) {
  2483. + mutex_lock(&tcp_sk(child)->mpcb->mpcb_mutex);
  2484. + mutex_taken = true;
  2485. + }
  2486. local_bh_disable();
  2487. bh_lock_sock(child);
  2488. WARN_ON(sock_owned_by_user(child));
  2489. @@ -833,6 +845,8 @@ void inet_csk_listen_stop(struct sock *sk)
  2490. bh_unlock_sock(child);
  2491. local_bh_enable();
  2492. + if (mutex_taken)
  2493. + mutex_unlock(&tcp_sk(child)->mpcb->mpcb_mutex);
  2494. sock_put(child);
  2495. sk_acceptq_removed(sk);
  2496. diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
  2497. index f2ed13c..f08addc 100644
  2498. --- a/net/ipv4/syncookies.c
  2499. +++ b/net/ipv4/syncookies.c
  2500. @@ -284,7 +284,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
  2501. /* check for timestamp cookie support */
  2502. memset(&tcp_opt, 0, sizeof(tcp_opt));
  2503. - tcp_parse_options(skb, &tcp_opt, 0, NULL);
  2504. + tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
  2505. if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
  2506. goto out;
  2507. @@ -354,10 +354,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
  2508. /* Try to redo what tcp_v4_send_synack did. */
  2509. req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
  2510. - tcp_select_initial_window(tcp_full_space(sk), req->mss,
  2511. + tp->select_initial_window(tcp_full_space(sk), req->mss,
  2512. &req->rcv_wnd, &req->window_clamp,
  2513. ireq->wscale_ok, &rcv_wscale,
  2514. - dst_metric(&rt->dst, RTAX_INITRWND));
  2515. + dst_metric(&rt->dst, RTAX_INITRWND), sk);
  2516. ireq->rcv_wscale = rcv_wscale;
  2517. diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
  2518. index 97c8f56..be72a40 100644
  2519. --- a/net/ipv4/tcp.c
  2520. +++ b/net/ipv4/tcp.c
  2521. @@ -271,6 +271,7 @@
  2522. #include <net/icmp.h>
  2523. #include <net/inet_common.h>
  2524. +#include <net/mptcp.h>
  2525. #include <net/tcp.h>
  2526. #include <net/xfrm.h>
  2527. #include <net/ip.h>
  2528. @@ -419,6 +420,9 @@ void tcp_init_sock(struct sock *sk)
  2529. sk->sk_sndbuf = sysctl_tcp_wmem[1];
  2530. sk->sk_rcvbuf = sysctl_tcp_rmem[1];
  2531. + /* Set function pointers in tcp_sock to tcp functions. */
  2532. + mptcp_init_tcp_sock(tp);
  2533. +
  2534. local_bh_disable();
  2535. sock_update_memcg(sk);
  2536. sk_sockets_allocated_inc(sk);
  2537. @@ -607,6 +611,8 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
  2538. tcb->seq = tcb->end_seq = tp->write_seq;
  2539. tcb->tcp_flags = TCPHDR_ACK;
  2540. tcb->sacked = 0;
  2541. + if (tp->mpc)
  2542. + mptcp_skb_entail_init(tp, skb);
  2543. skb_header_release(skb);
  2544. tcp_add_write_queue_tail(sk, skb);
  2545. sk->sk_wmem_queued += skb->truesize;
  2546. @@ -640,8 +646,8 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
  2547. atomic_read(&sk->sk_wmem_alloc) > skb->truesize;
  2548. }
  2549. -static void tcp_push(struct sock *sk, int flags, int mss_now,
  2550. - int nonagle, int size_goal)
  2551. +void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle,
  2552. + int size_goal)
  2553. {
  2554. struct tcp_sock *tp = tcp_sk(sk);
  2555. struct sk_buff *skb;
  2556. @@ -726,6 +732,14 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
  2557. int ret;
  2558. sock_rps_record_flow(sk);
  2559. +
  2560. +#ifdef CONFIG_MPTCP
  2561. + if (tcp_sk(sk)->mpc) {
  2562. + struct sock *sk_it;
  2563. + mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it)
  2564. + sock_rps_record_flow(sk_it);
  2565. + }
  2566. +#endif
  2567. /*
  2568. * We can't seek on a socket input
  2569. */
  2570. @@ -821,8 +835,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
  2571. return NULL;
  2572. }
  2573. -static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
  2574. - int large_allowed)
  2575. +unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed)
  2576. {
  2577. struct tcp_sock *tp = tcp_sk(sk);
  2578. u32 xmit_size_goal, old_size_goal;
  2579. @@ -872,8 +885,13 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
  2580. {
  2581. int mss_now;
  2582. - mss_now = tcp_current_mss(sk);
  2583. - *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
  2584. + if (tcp_sk(sk)->mpc) {
  2585. + mss_now = mptcp_current_mss(sk);
  2586. + *size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
  2587. + } else {
  2588. + mss_now = tcp_current_mss(sk);
  2589. + *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
  2590. + }
  2591. return mss_now;
  2592. }
  2593. @@ -897,6 +915,26 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
  2594. goto out_err;
  2595. }
  2596. + if (tp->mpc) {
  2597. + struct sock *sk_it = sk;
  2598. +
  2599. + /* We must check this with the socket lock held because we iterate
  2600. + * over the subflows.
  2601. + */
  2602. + if (!mptcp_can_sendpage(sk)) {
  2603. + ssize_t ret;
  2604. +
  2605. + release_sock(sk);
  2606. + ret = sock_no_sendpage(sk->sk_socket, page, offset,
  2607. + size, flags);
  2608. + lock_sock(sk);
  2609. + return ret;
  2610. + }
  2611. +
  2612. + mptcp_for_each_sk(tp->mpcb, sk_it)
  2613. + sock_rps_record_flow(sk_it);
  2614. + }
  2615. +
  2616. clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
  2617. mss_now = tcp_send_mss(sk, &size_goal, flags);
  2618. @@ -1001,8 +1039,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
  2619. {
  2620. ssize_t res;
  2621. - if (!(sk->sk_route_caps & NETIF_F_SG) ||
  2622. - !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
  2623. + /* If MPTCP is enabled, we check it later after establishment */
  2624. + if (!tcp_sk(sk)->mpc && (!(sk->sk_route_caps & NETIF_F_SG) ||
  2625. + !(sk->sk_route_caps & NETIF_F_ALL_CSUM)))
  2626. return sock_no_sendpage(sk->sk_socket, page, offset, size,
  2627. flags);
  2628. @@ -1018,6 +1057,9 @@ static inline int select_size(const struct sock *sk, bool sg)
  2629. const struct tcp_sock *tp = tcp_sk(sk);
  2630. int tmp = tp->mss_cache;
  2631. + if (tp->mpc)
  2632. + return mptcp_select_size(sk, sg);
  2633. +
  2634. if (sg) {
  2635. if (sk_can_gso(sk)) {
  2636. /* Small frames wont use a full page:
  2637. @@ -1105,6 +1147,12 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
  2638. goto do_error;
  2639. }
  2640. + if (tp->mpc) {
  2641. + struct sock *sk_it = sk;
  2642. + mptcp_for_each_sk(tp->mpcb, sk_it)
  2643. + sock_rps_record_flow(sk_it);
  2644. + }
  2645. +
  2646. if (unlikely(tp->repair)) {
  2647. if (tp->repair_queue == TCP_RECV_QUEUE) {
  2648. copied = tcp_send_rcvq(sk, msg, size);
  2649. @@ -1132,7 +1180,10 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
  2650. if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
  2651. goto out_err;
  2652. - sg = !!(sk->sk_route_caps & NETIF_F_SG);
  2653. + if (tp->mpc)
  2654. + sg = mptcp_can_sg(sk);
  2655. + else
  2656. + sg = !!(sk->sk_route_caps & NETIF_F_SG);
  2657. while (--iovlen >= 0) {
  2658. size_t seglen = iov->iov_len;
  2659. @@ -1183,8 +1234,15 @@ new_segment:
  2660. /*
  2661. * Check whether we can use HW checksum.
  2662. + *
  2663. + * If dss-csum is enabled, we do not do hw-csum.
  2664. + * In case of non-mptcp we check the
  2665. + * device-capabilities.
  2666. + * In case of mptcp, hw-csum's will be handled
  2667. + * later in mptcp_write_xmit.
  2668. */
  2669. - if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
  2670. + if (((tp->mpc && !tp->mpcb->dss_csum) || !tp->mpc) &&
  2671. + (tp->mpc || sk->sk_route_caps & NETIF_F_ALL_CSUM))
  2672. skb->ip_summed = CHECKSUM_PARTIAL;
  2673. skb_entail(sk, skb);
  2674. @@ -1385,6 +1443,11 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
  2675. struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
  2676. + if (is_meta_sk(sk)) {
  2677. + mptcp_cleanup_rbuf(sk, copied);
  2678. + return;
  2679. + }
  2680. +
  2681. WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
  2682. "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
  2683. tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
  2684. @@ -1421,7 +1484,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
  2685. /* Optimize, __tcp_select_window() is not cheap. */
  2686. if (2*rcv_window_now <= tp->window_clamp) {
  2687. - __u32 new_window = __tcp_select_window(sk);
  2688. + __u32 new_window = tp->__select_window(sk);
  2689. /* Send ACK now, if this read freed lots of space
  2690. * in our buffer. Certainly, new_window is new window.
  2691. @@ -1622,6 +1685,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
  2692. lock_sock(sk);
  2693. +#ifdef CONFIG_MPTCP
  2694. + if (tp->mpc) {
  2695. + struct sock *sk_it;
  2696. + mptcp_for_each_sk(tp->mpcb, sk_it)
  2697. + sock_rps_record_flow(sk_it);
  2698. + }
  2699. +#endif
  2700. +
  2701. err = -ENOTCONN;
  2702. if (sk->sk_state == TCP_LISTEN)
  2703. goto out;
  2704. @@ -2069,7 +2140,7 @@ static const unsigned char new_state[16] = {
  2705. /* TCP_CLOSING */ TCP_CLOSING,
  2706. };
  2707. -static int tcp_close_state(struct sock *sk)
  2708. +int tcp_close_state(struct sock *sk)
  2709. {
  2710. int next = (int)new_state[sk->sk_state];
  2711. int ns = next & TCP_STATE_MASK;
  2712. @@ -2098,8 +2169,12 @@ void tcp_shutdown(struct sock *sk, int how)
  2713. (TCPF_ESTABLISHED | TCPF_SYN_SENT |
  2714. TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
  2715. /* Clear out any half completed packets. FIN if needed. */
  2716. - if (tcp_close_state(sk))
  2717. - tcp_send_fin(sk);
  2718. + if (tcp_close_state(sk)) {
  2719. + if (!is_meta_sk(sk))
  2720. + tcp_send_fin(sk);
  2721. + else
  2722. + mptcp_send_fin(sk);
  2723. + }
  2724. }
  2725. }
  2726. EXPORT_SYMBOL(tcp_shutdown);
  2727. @@ -2124,6 +2199,11 @@ void tcp_close(struct sock *sk, long timeout)
  2728. int data_was_unread = 0;
  2729. int state;
  2730. + if (is_meta_sk(sk)) {
  2731. + mptcp_close(sk, timeout);
  2732. + return;
  2733. + }
  2734. +
  2735. lock_sock(sk);
  2736. sk->sk_shutdown = SHUTDOWN_MASK;
  2737. @@ -2290,15 +2370,6 @@ out:
  2738. }
  2739. EXPORT_SYMBOL(tcp_close);
  2740. -/* These states need RST on ABORT according to RFC793 */
  2741. -
  2742. -static inline bool tcp_need_reset(int state)
  2743. -{
  2744. - return (1 << state) &
  2745. - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
  2746. - TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
  2747. -}
  2748. -
  2749. int tcp_disconnect(struct sock *sk, int flags)
  2750. {
  2751. struct inet_sock *inet = inet_sk(sk);
  2752. @@ -2339,6 +2410,13 @@ int tcp_disconnect(struct sock *sk, int flags)
  2753. if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
  2754. inet_reset_saddr(sk);
  2755. + if (is_meta_sk(sk)) {
  2756. + mptcp_disconnect(sk);
  2757. + } else {
  2758. + if (tp->inside_tk_table)
  2759. + mptcp_hash_remove_bh(tp);
  2760. + }
  2761. +
  2762. sk->sk_shutdown = 0;
  2763. sock_reset_flag(sk, SOCK_DONE);
  2764. tp->srtt = 0;
  2765. @@ -2698,6 +2776,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
  2766. tp->notsent_lowat = val;
  2767. sk->sk_write_space(sk);
  2768. break;
  2769. +#ifdef CONFIG_MPTCP
  2770. + case MPTCP_ENABLED:
  2771. + if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_LISTEN) {
  2772. + if (val)
  2773. + tp->mptcp_enabled = 1;
  2774. + else
  2775. + tp->mptcp_enabled = 0;
  2776. + } else {
  2777. + err = -EPERM;
  2778. + }
  2779. + break;
  2780. +#endif
  2781. default:
  2782. err = -ENOPROTOOPT;
  2783. break;
  2784. @@ -2917,6 +3007,11 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
  2785. case TCP_NOTSENT_LOWAT:
  2786. val = tp->notsent_lowat;
  2787. break;
  2788. +#ifdef CONFIG_MPTCP
  2789. + case MPTCP_ENABLED:
  2790. + val = tp->mptcp_enabled;
  2791. + break;
  2792. +#endif
  2793. default:
  2794. return -ENOPROTOOPT;
  2795. }
  2796. @@ -3106,8 +3201,11 @@ void tcp_done(struct sock *sk)
  2797. if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
  2798. TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
  2799. + WARN_ON(sk->sk_state == TCP_CLOSE);
  2800. tcp_set_state(sk, TCP_CLOSE);
  2801. +
  2802. tcp_clear_xmit_timers(sk);
  2803. +
  2804. if (req != NULL)
  2805. reqsk_fastopen_remove(sk, req, false);
  2806. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
  2807. index eeaac39..cb06531 100644
  2808. --- a/net/ipv4/tcp_input.c
  2809. +++ b/net/ipv4/tcp_input.c
  2810. @@ -74,6 +74,9 @@
  2811. #include <linux/ipsec.h>
  2812. #include <asm/unaligned.h>
  2813. #include <net/netdma.h>
  2814. +#include <net/mptcp.h>
  2815. +#include <net/mptcp_v4.h>
  2816. +#include <net/mptcp_v6.h>
  2817. int sysctl_tcp_timestamps __read_mostly = 1;
  2818. int sysctl_tcp_window_scaling __read_mostly = 1;
  2819. @@ -99,25 +102,6 @@ int sysctl_tcp_thin_dupack __read_mostly;
  2820. int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
  2821. int sysctl_tcp_early_retrans __read_mostly = 3;
  2822. -#define FLAG_DATA 0x01 /* Incoming frame contained data. */
  2823. -#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
  2824. -#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
  2825. -#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
  2826. -#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
  2827. -#define FLAG_DATA_SACKED 0x20 /* New SACK. */
  2828. -#define FLAG_ECE 0x40 /* ECE in this ACK */
  2829. -#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
  2830. -#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
  2831. -#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
  2832. -#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
  2833. -#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
  2834. -#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
  2835. -
  2836. -#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
  2837. -#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
  2838. -#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
  2839. -#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
  2840. -
  2841. #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
  2842. #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
  2843. @@ -283,8 +267,12 @@ static void tcp_sndbuf_expand(struct sock *sk)
  2844. per_mss = roundup_pow_of_two(per_mss) +
  2845. SKB_DATA_ALIGN(sizeof(struct sk_buff));
  2846. - nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
  2847. - nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
  2848. + if (tp->mpc) {
  2849. + nr_segs = mptcp_check_snd_buf(tp);
  2850. + } else {
  2851. + nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
  2852. + nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
  2853. + }
  2854. /* Fast Recovery (RFC 5681 3.2) :
  2855. * Cubic needs 1.7 factor, rounded to 2 to include
  2856. @@ -292,8 +280,16 @@ static void tcp_sndbuf_expand(struct sock *sk)
  2857. */
  2858. sndmem = 2 * nr_segs * per_mss;
  2859. - if (sk->sk_sndbuf < sndmem)
  2860. + /* MPTCP: after this sndmem is the new contribution of the
  2861. + * current subflow to the aggregated sndbuf */
  2862. + if (sk->sk_sndbuf < sndmem) {
  2863. + int old_sndbuf = sk->sk_sndbuf;
  2864. sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
  2865. + /* MPTCP: ok, the subflow sndbuf has grown, reflect
  2866. + * this in the aggregate buffer.*/
  2867. + if (tp->mpc && old_sndbuf != sk->sk_sndbuf)
  2868. + mptcp_update_sndbuf(tp->mpcb);
  2869. + }
  2870. }
  2871. /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
  2872. @@ -342,10 +338,12 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
  2873. static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
  2874. {
  2875. struct tcp_sock *tp = tcp_sk(sk);
  2876. + struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
  2877. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  2878. /* Check #1 */
  2879. - if (tp->rcv_ssthresh < tp->window_clamp &&
  2880. - (int)tp->rcv_ssthresh < tcp_space(sk) &&
  2881. + if (meta_tp->rcv_ssthresh < meta_tp->window_clamp &&
  2882. + (int)meta_tp->rcv_ssthresh < tcp_space(sk) &&
  2883. !sk_under_memory_pressure(sk)) {
  2884. int incr;
  2885. @@ -353,14 +351,14 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
  2886. * will fit to rcvbuf in future.
  2887. */
  2888. if (tcp_win_from_space(skb->truesize) <= skb->len)
  2889. - incr = 2 * tp->advmss;
  2890. + incr = 2 * meta_tp->advmss;
  2891. else
  2892. - incr = __tcp_grow_window(sk, skb);
  2893. + incr = __tcp_grow_window(meta_sk, skb);
  2894. if (incr) {
  2895. incr = max_t(int, incr, 2 * skb->len);
  2896. - tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
  2897. - tp->window_clamp);
  2898. + meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh + incr,
  2899. + meta_tp->window_clamp);
  2900. inet_csk(sk)->icsk_ack.quick |= 1;
  2901. }
  2902. }
  2903. @@ -543,7 +541,10 @@ void tcp_rcv_space_adjust(struct sock *sk)
  2904. int copied;
  2905. time = tcp_time_stamp - tp->rcvq_space.time;
  2906. - if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
  2907. + if (tp->mpc) {
  2908. + if (mptcp_check_rtt(tp, time))
  2909. + return;
  2910. + } else if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
  2911. return;
  2912. /* Number of bytes copied to user in last RTT */
  2913. @@ -768,7 +769,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
  2914. /* Calculate rto without backoff. This is the second half of Van Jacobson's
  2915. * routine referred to above.
  2916. */
  2917. -static void tcp_set_rto(struct sock *sk)
  2918. +void tcp_set_rto(struct sock *sk)
  2919. {
  2920. const struct tcp_sock *tp = tcp_sk(sk);
  2921. /* Old crap is replaced with new one. 8)
  2922. @@ -2914,7 +2915,7 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
  2923. return false;
  2924. tcp_rtt_estimator(sk, seq_rtt);
  2925. - tcp_set_rto(sk);
  2926. + tp->set_rto(sk);
  2927. /* RFC6298: only reset backoff on valid RTT measurement. */
  2928. inet_csk(sk)->icsk_backoff = 0;
  2929. @@ -2998,7 +2999,7 @@ void tcp_resume_early_retransmit(struct sock *sk)
  2930. }
  2931. /* If we get here, the whole TSO packet has not been acked. */
  2932. -static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
  2933. +u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
  2934. {
  2935. struct tcp_sock *tp = tcp_sk(sk);
  2936. u32 packets_acked;
  2937. @@ -3092,6 +3093,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
  2938. */
  2939. if (!(scb->tcp_flags & TCPHDR_SYN)) {
  2940. flag |= FLAG_DATA_ACKED;
  2941. + if (tp->mpc && mptcp_is_data_seq(skb))
  2942. + flag |= MPTCP_FLAG_DATA_ACKED;
  2943. } else {
  2944. flag |= FLAG_SYN_ACKED;
  2945. tp->retrans_stamp = 0;
  2946. @@ -3194,7 +3197,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
  2947. return flag;
  2948. }
  2949. -static void tcp_ack_probe(struct sock *sk)
  2950. +void tcp_ack_probe(struct sock *sk)
  2951. {
  2952. const struct tcp_sock *tp = tcp_sk(sk);
  2953. struct inet_connection_sock *icsk = inet_csk(sk);
  2954. @@ -3241,9 +3244,8 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
  2955. /* Check that window update is acceptable.
  2956. * The function assumes that snd_una<=ack<=snd_next.
  2957. */
  2958. -static inline bool tcp_may_update_window(const struct tcp_sock *tp,
  2959. - const u32 ack, const u32 ack_seq,
  2960. - const u32 nwin)
  2961. +bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
  2962. + const u32 ack_seq, const u32 nwin)
  2963. {
  2964. return after(ack, tp->snd_una) ||
  2965. after(ack_seq, tp->snd_wl1) ||
  2966. @@ -3362,7 +3364,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
  2967. }
  2968. /* This routine deals with incoming acks, but not outgoing ones. */
  2969. -static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
  2970. +static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
  2971. {
  2972. struct inet_connection_sock *icsk = inet_csk(sk);
  2973. struct tcp_sock *tp = tcp_sk(sk);
  2974. @@ -3455,6 +3457,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
  2975. flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
  2976. acked -= tp->packets_out;
  2977. + if (tp->mpc) {
  2978. + if (mptcp_fallback_infinite(sk, flag)) {
  2979. + pr_err("%s resetting flow\n", __func__);
  2980. + mptcp_send_reset(sk);
  2981. + goto invalid_ack;
  2982. + }
  2983. +
  2984. + mptcp_clean_rtx_infinite(skb, sk);
  2985. + }
  2986. +
  2987. /* Advance cwnd if state allows */
  2988. if (tcp_may_raise_cwnd(sk, flag))
  2989. tcp_cong_avoid(sk, ack, acked, prior_in_flight);
  2990. @@ -3519,8 +3531,9 @@ old_ack:
  2991. * the fast version below fails.
  2992. */
  2993. void tcp_parse_options(const struct sk_buff *skb,
  2994. - struct tcp_options_received *opt_rx, int estab,
  2995. - struct tcp_fastopen_cookie *foc)
  2996. + struct tcp_options_received *opt_rx,
  2997. + struct mptcp_options_received *mopt,
  2998. + int estab, struct tcp_fastopen_cookie *foc)
  2999. {
  3000. const unsigned char *ptr;
  3001. const struct tcphdr *th = tcp_hdr(skb);
  3002. @@ -3603,6 +3616,10 @@ void tcp_parse_options(const struct sk_buff *skb,
  3003. */
  3004. break;
  3005. #endif
  3006. + case TCPOPT_MPTCP:
  3007. + mptcp_parse_options(ptr - 2, opsize, opt_rx,
  3008. + mopt, skb);
  3009. + break;
  3010. case TCPOPT_EXP:
  3011. /* Fast Open option shares code 254 using a
  3012. * 16 bits magic number. It's valid only in
  3013. @@ -3664,8 +3681,8 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
  3014. if (tcp_parse_aligned_timestamp(tp, th))
  3015. return true;
  3016. }
  3017. -
  3018. - tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
  3019. + tcp_parse_options(skb, &tp->rx_opt, tp->mpc ? &tp->mptcp->rx_opt : NULL,
  3020. + 1, NULL);
  3021. if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
  3022. tp->rx_opt.rcv_tsecr -= tp->tsoffset;
  3023. @@ -3838,6 +3855,8 @@ static void tcp_fin(struct sock *sk)
  3024. dst = __sk_dst_get(sk);
  3025. if (!dst || !dst_metric(dst, RTAX_QUICKACK))
  3026. inet_csk(sk)->icsk_ack.pingpong = 1;
  3027. + if (tp->mpc)
  3028. + mptcp_sub_close_passive(sk);
  3029. break;
  3030. case TCP_CLOSE_WAIT:
  3031. @@ -3859,6 +3878,13 @@ static void tcp_fin(struct sock *sk)
  3032. tcp_set_state(sk, TCP_CLOSING);
  3033. break;
  3034. case TCP_FIN_WAIT2:
  3035. + if (tp->mpc) {
  3036. + /* The socket will get closed by mptcp_data_ready.
  3037. + * We first have to process all data-sequences.
  3038. + */
  3039. + tp->close_it = 1;
  3040. + break;
  3041. + }
  3042. /* Received a FIN -- send ACK and enter TIME_WAIT. */
  3043. tcp_send_ack(sk);
  3044. tcp_time_wait(sk, TCP_TIME_WAIT, 0);
  3045. @@ -3883,6 +3909,10 @@ static void tcp_fin(struct sock *sk)
  3046. if (!sock_flag(sk, SOCK_DEAD)) {
  3047. sk->sk_state_change(sk);
  3048. + /* Don't wake up MPTCP-subflows */
  3049. + if (tp->mpc)
  3050. + return;
  3051. +
  3052. /* Do not send POLL_HUP for half duplex close. */
  3053. if (sk->sk_shutdown == SHUTDOWN_MASK ||
  3054. sk->sk_state == TCP_CLOSE)
  3055. @@ -4080,7 +4110,11 @@ static void tcp_ofo_queue(struct sock *sk)
  3056. tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
  3057. }
  3058. - if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
  3059. + /* In case of MPTCP, the segment may be empty if it's a
  3060. + * non-data DATA_FIN. (see beginning of tcp_data_queue)
  3061. + */
  3062. + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) &&
  3063. + !(tp->mpc && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)) {
  3064. SOCK_DEBUG(sk, "ofo packet was already received\n");
  3065. __skb_unlink(skb, &tp->out_of_order_queue);
  3066. __kfree_skb(skb);
  3067. @@ -4104,6 +4138,9 @@ static int tcp_prune_queue(struct sock *sk);
  3068. static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
  3069. unsigned int size)
  3070. {
  3071. + if (tcp_sk(sk)->mpc)
  3072. + sk = mptcp_meta_sk(sk);
  3073. +
  3074. if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
  3075. !sk_rmem_schedule(sk, skb, size)) {
  3076. @@ -4134,15 +4171,16 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
  3077. * Better try to coalesce them right now to avoid future collapses.
  3078. * Returns true if caller should free @from instead of queueing it
  3079. */
  3080. -static bool tcp_try_coalesce(struct sock *sk,
  3081. - struct sk_buff *to,
  3082. - struct sk_buff *from,
  3083. - bool *fragstolen)
  3084. +bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from,
  3085. + bool *fragstolen)
  3086. {
  3087. int delta;
  3088. *fragstolen = false;
  3089. + if (tcp_sk(sk)->mpc && !is_meta_sk(sk))
  3090. + return false;
  3091. +
  3092. if (tcp_hdr(from)->fin)
  3093. return false;
  3094. @@ -4232,7 +4270,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
  3095. /* Do skb overlap to previous one? */
  3096. if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
  3097. - if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
  3098. + /* MPTCP allows non-data data-fin to be in the ofo-queue */
  3099. + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq) &&
  3100. + !(tp->mpc && end_seq == seq)) {
  3101. /* All the bits are present. Drop. */
  3102. NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
  3103. __kfree_skb(skb);
  3104. @@ -4270,6 +4310,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
  3105. end_seq);
  3106. break;
  3107. }
  3108. + /* MPTCP allows non-data data-fin to be in the ofo-queue */
  3109. + if (tp->mpc && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq)
  3110. + continue;
  3111. __skb_unlink(skb1, &tp->out_of_order_queue);
  3112. tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
  3113. TCP_SKB_CB(skb1)->end_seq);
  3114. @@ -4287,8 +4330,8 @@ end:
  3115. }
  3116. }
  3117. -static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
  3118. - bool *fragstolen)
  3119. +int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
  3120. + bool *fragstolen)
  3121. {
  3122. int eaten;
  3123. struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
  3124. @@ -4350,7 +4393,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
  3125. int eaten = -1;
  3126. bool fragstolen = false;
  3127. - if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
  3128. + /* If no data is present, but a data_fin is in the options, we still
  3129. + * have to call mptcp_queue_skb later on. */
  3130. + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
  3131. + !(tp->mpc && mptcp_is_data_fin(skb)))
  3132. goto drop;
  3133. skb_dst_drop(skb);
  3134. @@ -4396,7 +4442,7 @@ queue_and_out:
  3135. eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
  3136. }
  3137. tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
  3138. - if (skb->len)
  3139. + if (skb->len || mptcp_is_data_fin(skb))
  3140. tcp_event_data_recv(sk, skb);
  3141. if (th->fin)
  3142. tcp_fin(sk);
  3143. @@ -4418,7 +4464,11 @@ queue_and_out:
  3144. if (eaten > 0)
  3145. kfree_skb_partial(skb, fragstolen);
  3146. - if (!sock_flag(sk, SOCK_DEAD))
  3147. + if (!sock_flag(sk, SOCK_DEAD) || tp->mpc)
  3148. + /* MPTCP: we always have to call data_ready, because
  3149. + * we may be about to receive a data-fin, which still
  3150. + * must get queued.
  3151. + */
  3152. sk->sk_data_ready(sk, 0);
  3153. return;
  3154. }
  3155. @@ -4470,6 +4520,8 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
  3156. next = skb_queue_next(list, skb);
  3157. __skb_unlink(skb, list);
  3158. + if (tcp_sk(sk)->mpc)
  3159. + mptcp_remove_shortcuts(tcp_sk(sk)->mpcb, skb);
  3160. __kfree_skb(skb);
  3161. NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
  3162. @@ -4642,6 +4694,18 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
  3163. struct tcp_sock *tp = tcp_sk(sk);
  3164. bool res = false;
  3165. + if (is_meta_sk(sk)) {
  3166. + if (!skb_queue_empty(&tp->out_of_order_queue)) {
  3167. + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
  3168. + mptcp_purge_ofo_queue(tp);
  3169. +
  3170. + /* No sack at the mptcp-level */
  3171. + sk_mem_reclaim(sk);
  3172. + res = true;
  3173. + }
  3174. + return res;
  3175. + }
  3176. +
  3177. if (!skb_queue_empty(&tp->out_of_order_queue)) {
  3178. NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
  3179. __skb_queue_purge(&tp->out_of_order_queue);
  3180. @@ -4731,7 +4795,7 @@ void tcp_cwnd_application_limited(struct sock *sk)
  3181. tp->snd_cwnd_stamp = tcp_time_stamp;
  3182. }
  3183. -static bool tcp_should_expand_sndbuf(const struct sock *sk)
  3184. +bool tcp_should_expand_sndbuf(const struct sock *sk)
  3185. {
  3186. const struct tcp_sock *tp = tcp_sk(sk);
  3187. @@ -4766,7 +4830,7 @@ static void tcp_new_space(struct sock *sk)
  3188. {
  3189. struct tcp_sock *tp = tcp_sk(sk);
  3190. - if (tcp_should_expand_sndbuf(sk)) {
  3191. + if (tp->should_expand_sndbuf(sk)) {
  3192. tcp_sndbuf_expand(sk);
  3193. tp->snd_cwnd_stamp = tcp_time_stamp;
  3194. }
  3195. @@ -4778,8 +4842,9 @@ static void tcp_check_space(struct sock *sk)
  3196. {
  3197. if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
  3198. sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
  3199. - if (sk->sk_socket &&
  3200. - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
  3201. + if (tcp_sk(sk)->mpc ||
  3202. + (sk->sk_socket &&
  3203. + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)))
  3204. tcp_new_space(sk);
  3205. }
  3206. }
  3207. @@ -4802,7 +4867,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
  3208. /* ... and right edge of window advances far enough.
  3209. * (tcp_recvmsg() will send ACK otherwise). Or...
  3210. */
  3211. - __tcp_select_window(sk) >= tp->rcv_wnd) ||
  3212. + tp->__select_window(sk) >= tp->rcv_wnd) ||
  3213. /* We ACK each frame or... */
  3214. tcp_in_quickack_mode(sk) ||
  3215. /* We have out of order data. */
  3216. @@ -4904,6 +4969,10 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
  3217. {
  3218. struct tcp_sock *tp = tcp_sk(sk);
  3219. + /* MPTCP urgent data is not yet supported */
  3220. + if (tp->mpc)
  3221. + return;
  3222. +
  3223. /* Check if we get a new urgent pointer - normally not. */
  3224. if (th->urg)
  3225. tcp_check_urg(sk, th);
  3226. @@ -4971,8 +5040,7 @@ static inline bool tcp_checksum_complete_user(struct sock *sk,
  3227. }
  3228. #ifdef CONFIG_NET_DMA
  3229. -static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
  3230. - int hlen)
  3231. +bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
  3232. {
  3233. struct tcp_sock *tp = tcp_sk(sk);
  3234. int chunk = skb->len - hlen;
  3235. @@ -5081,9 +5149,15 @@ syn_challenge:
  3236. goto discard;
  3237. }
  3238. + /* If valid: post process the received MPTCP options. */
  3239. + if (tp->mpc && mptcp_handle_options(sk, th, skb))
  3240. + goto discard;
  3241. +
  3242. return true;
  3243. discard:
  3244. + if (tp->mpc)
  3245. + mptcp_reset_mopt(tp);
  3246. __kfree_skb(skb);
  3247. return false;
  3248. }
  3249. @@ -5135,6 +5209,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
  3250. tp->rx_opt.saw_tstamp = 0;
  3251. + /* MPTCP: force slowpath. */
  3252. + if (tp->mpc)
  3253. + goto slow_path;
  3254. +
  3255. /* pred_flags is 0xS?10 << 16 + snd_wnd
  3256. * if header_prediction is to be made
  3257. * 'S' will always be tp->tcp_header_len >> 2
  3258. @@ -5349,7 +5427,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
  3259. */
  3260. tp->lsndtime = tcp_time_stamp;
  3261. - tcp_init_buffer_space(sk);
  3262. + tp->init_buffer_space(sk);
  3263. if (sock_flag(sk, SOCK_KEEPOPEN))
  3264. inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
  3265. @@ -5379,7 +5457,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
  3266. /* Get original SYNACK MSS value if user MSS sets mss_clamp */
  3267. tcp_clear_options(&opt);
  3268. opt.user_mss = opt.mss_clamp = 0;
  3269. - tcp_parse_options(synack, &opt, 0, NULL);
  3270. + tcp_parse_options(synack, &opt, NULL, 0, NULL);
  3271. mss = opt.mss_clamp;
  3272. }
  3273. @@ -5414,8 +5492,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
  3274. struct tcp_sock *tp = tcp_sk(sk);
  3275. struct tcp_fastopen_cookie foc = { .len = -1 };
  3276. int saved_clamp = tp->rx_opt.mss_clamp;
  3277. + struct mptcp_options_received mopt;
  3278. + mptcp_init_mp_opt(&mopt);
  3279. - tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
  3280. + tcp_parse_options(skb, &tp->rx_opt,
  3281. + tp->mpc ? &tp->mptcp->rx_opt : &mopt, 0, &foc);
  3282. if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
  3283. tp->rx_opt.rcv_tsecr -= tp->tsoffset;
  3284. @@ -5462,6 +5543,21 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
  3285. if (!th->syn)
  3286. goto discard_and_undo;
  3287. + if (tp->request_mptcp || tp->mpc) {
  3288. + int ret;
  3289. + ret = mptcp_rcv_synsent_state_process(sk, &sk,
  3290. + skb, &mopt);
  3291. +
  3292. + /* May have changed if we support MPTCP */
  3293. + tp = tcp_sk(sk);
  3294. + icsk = inet_csk(sk);
  3295. +
  3296. + if (ret == 1)
  3297. + goto reset_and_undo;
  3298. + if (ret == 2)
  3299. + goto discard;
  3300. + }
  3301. +
  3302. /* rfc793:
  3303. * "If the SYN bit is on ...
  3304. * are acceptable then ...
  3305. @@ -5474,6 +5570,15 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
  3306. tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
  3307. tcp_ack(sk, skb, FLAG_SLOWPATH);
  3308. + if (tp->mpc && !is_master_tp(tp)) {
  3309. + /* Timer for repeating the ACK until an answer
  3310. + * arrives. Used only when establishing an additional
  3311. + * subflow inside of an MPTCP connection.
  3312. + */
  3313. + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
  3314. + jiffies + icsk->icsk_rto);
  3315. + }
  3316. +
  3317. /* Ok.. it's good. Set up sequence numbers and
  3318. * move to established.
  3319. */
  3320. @@ -5500,6 +5605,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
  3321. tp->tcp_header_len = sizeof(struct tcphdr);
  3322. }
  3323. + if (tp->mpc) {
  3324. + tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
  3325. + tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
  3326. + }
  3327. +
  3328. if (tcp_is_sack(tp) && sysctl_tcp_fack)
  3329. tcp_enable_fack(tp);
  3330. @@ -5520,7 +5630,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
  3331. tcp_rcv_fastopen_synack(sk, skb, &foc))
  3332. return -1;
  3333. - if (sk->sk_write_pending ||
  3334. + /* With MPTCP we cannot send data on the third ack due to the
  3335. + * lack of option-space */
  3336. + if ((sk->sk_write_pending && !tp->mpc) ||
  3337. icsk->icsk_accept_queue.rskq_defer_accept ||
  3338. icsk->icsk_ack.pingpong) {
  3339. /* Save one ACK. Data will be ready after
  3340. @@ -5562,6 +5674,7 @@ discard:
  3341. tcp_paws_reject(&tp->rx_opt, 0))
  3342. goto discard_and_undo;
  3343. + /* TODO - check this here for MPTCP */
  3344. if (th->syn) {
  3345. /* We see SYN without ACK. It is attempt of
  3346. * simultaneous connect with crossed SYNs.
  3347. @@ -5578,6 +5691,11 @@ discard:
  3348. tp->tcp_header_len = sizeof(struct tcphdr);
  3349. }
  3350. + if (tp->mpc) {
  3351. + tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
  3352. + tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
  3353. + }
  3354. +
  3355. tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
  3356. tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
  3357. @@ -5636,6 +5754,7 @@ reset_and_undo:
  3358. int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
  3359. const struct tcphdr *th, unsigned int len)
  3360. + __releases(&sk->sk_lock.slock)
  3361. {
  3362. struct tcp_sock *tp = tcp_sk(sk);
  3363. struct inet_connection_sock *icsk = inet_csk(sk);
  3364. @@ -5687,6 +5806,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
  3365. case TCP_SYN_SENT:
  3366. queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
  3367. + if (is_meta_sk(sk)) {
  3368. + sk = tcp_sk(sk)->mpcb->master_sk;
  3369. + tp = tcp_sk(sk);
  3370. + }
  3371. if (queued >= 0)
  3372. return queued;
  3373. @@ -5694,6 +5817,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
  3374. tcp_urg(sk, skb, th);
  3375. __kfree_skb(skb);
  3376. tcp_data_snd_check(sk);
  3377. + if (tp->mpc && is_master_tp(tp))
  3378. + bh_unlock_sock(sk);
  3379. return 0;
  3380. }
  3381. @@ -5736,7 +5861,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
  3382. tcp_mtup_init(sk);
  3383. tp->copied_seq = tp->rcv_nxt;
  3384. - tcp_init_buffer_space(sk);
  3385. + tp->init_buffer_space(sk);
  3386. }
  3387. smp_mb();
  3388. tcp_set_state(sk, TCP_ESTABLISHED);
  3389. @@ -5756,6 +5881,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
  3390. if (tp->rx_opt.tstamp_ok)
  3391. tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
  3392. + if (tp->mpc)
  3393. + tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
  3394. if (req) {
  3395. /* Re-arm the timer because data may have been sent out.
  3396. @@ -5777,6 +5904,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
  3397. tcp_initialize_rcv_mss(sk);
  3398. tcp_fast_path_on(tp);
  3399. + /* Send an ACK when establishing a new
  3400. + * MPTCP subflow, i.e. using an MP_JOIN
  3401. + * subtype.
  3402. + */
  3403. + if (tp->mpc && !is_master_tp(tp))
  3404. + tcp_send_ack(sk);
  3405. break;
  3406. case TCP_FIN_WAIT1: {
  3407. @@ -5828,7 +5961,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
  3408. tmo = tcp_fin_time(sk);
  3409. if (tmo > TCP_TIMEWAIT_LEN) {
  3410. inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
  3411. - } else if (th->fin || sock_owned_by_user(sk)) {
  3412. + } else if (th->fin || mptcp_is_data_fin(skb) ||
  3413. + sock_owned_by_user(sk)) {
  3414. /* Bad case. We could lose such FIN otherwise.
  3415. * It is not a big problem, but it looks confusing
  3416. * and not so rare event. We still can lose it now,
  3417. @@ -5857,6 +5991,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
  3418. goto discard;
  3419. }
  3420. break;
  3421. + case TCP_CLOSE:
  3422. + if (tp->mp_killed)
  3423. + goto discard;
  3424. }
  3425. /* step 6: check the URG bit */
  3426. @@ -5877,7 +6014,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
  3427. */
  3428. if (sk->sk_shutdown & RCV_SHUTDOWN) {
  3429. if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
  3430. - after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
  3431. + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
  3432. + !tp->mpc) {
  3433. + /* In case of mptcp, the reset is handled by
  3434. + * mptcp_rcv_state_process
  3435. + */
  3436. NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
  3437. tcp_reset(sk);
  3438. return 1;
  3439. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
  3440. index 1e4eac7..5891fa6 100644
  3441. --- a/net/ipv4/tcp_ipv4.c
  3442. +++ b/net/ipv4/tcp_ipv4.c
  3443. @@ -67,6 +67,8 @@
  3444. #include <net/icmp.h>
  3445. #include <net/inet_hashtables.h>
  3446. #include <net/tcp.h>
  3447. +#include <net/mptcp.h>
  3448. +#include <net/mptcp_v4.h>
  3449. #include <net/transp_v6.h>
  3450. #include <net/ipv6.h>
  3451. #include <net/inet_common.h>
  3452. @@ -99,7 +101,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  3453. struct inet_hashinfo tcp_hashinfo;
  3454. EXPORT_SYMBOL(tcp_hashinfo);
  3455. -static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
  3456. +__u32 tcp_v4_init_sequence(const struct sk_buff *skb)
  3457. {
  3458. return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
  3459. ip_hdr(skb)->saddr,
  3460. @@ -334,7 +336,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
  3461. struct inet_sock *inet;
  3462. const int type = icmp_hdr(icmp_skb)->type;
  3463. const int code = icmp_hdr(icmp_skb)->code;
  3464. - struct sock *sk;
  3465. + struct sock *sk, *meta_sk;
  3466. struct sk_buff *skb;
  3467. struct request_sock *req;
  3468. __u32 seq;
  3469. @@ -358,13 +360,19 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
  3470. return;
  3471. }
  3472. - bh_lock_sock(sk);
  3473. + tp = tcp_sk(sk);
  3474. + if (tp->mpc)
  3475. + meta_sk = mptcp_meta_sk(sk);
  3476. + else
  3477. + meta_sk = sk;
  3478. +
  3479. + bh_lock_sock(meta_sk);
  3480. /* If too many ICMPs get dropped on busy
  3481. * servers this needs to be solved differently.
  3482. * We do take care of PMTU discovery (RFC1191) special case :
  3483. * we can receive locally generated ICMP messages while socket is held.
  3484. */
  3485. - if (sock_owned_by_user(sk)) {
  3486. + if (sock_owned_by_user(meta_sk)) {
  3487. if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
  3488. NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
  3489. }
  3490. @@ -377,7 +385,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
  3491. }
  3492. icsk = inet_csk(sk);
  3493. - tp = tcp_sk(sk);
  3494. req = tp->fastopen_rsk;
  3495. seq = ntohl(th->seq);
  3496. if (sk->sk_state != TCP_LISTEN &&
  3497. @@ -411,11 +418,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
  3498. goto out;
  3499. tp->mtu_info = info;
  3500. - if (!sock_owned_by_user(sk)) {
  3501. + if (!sock_owned_by_user(meta_sk)) {
  3502. tcp_v4_mtu_reduced(sk);
  3503. } else {
  3504. if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
  3505. sock_hold(sk);
  3506. + if (tp->mpc)
  3507. + mptcp_tsq_flags(sk);
  3508. }
  3509. goto out;
  3510. }
  3511. @@ -431,7 +440,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
  3512. /* XXX (TFO) - revisit the following logic for TFO */
  3513. - if (sock_owned_by_user(sk))
  3514. + if (sock_owned_by_user(meta_sk))
  3515. break;
  3516. icsk->icsk_backoff--;
  3517. @@ -473,7 +482,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
  3518. switch (sk->sk_state) {
  3519. struct request_sock *req, **prev;
  3520. case TCP_LISTEN:
  3521. - if (sock_owned_by_user(sk))
  3522. + if (sock_owned_by_user(meta_sk))
  3523. goto out;
  3524. req = inet_csk_search_req(sk, &prev, th->dest,
  3525. @@ -506,7 +515,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
  3526. It can f.e. if SYNs crossed,
  3527. or Fast Open.
  3528. */
  3529. - if (!sock_owned_by_user(sk)) {
  3530. + if (!sock_owned_by_user(meta_sk)) {
  3531. sk->sk_err = err;
  3532. sk->sk_error_report(sk);
  3533. @@ -535,7 +544,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
  3534. */
  3535. inet = inet_sk(sk);
  3536. - if (!sock_owned_by_user(sk) && inet->recverr) {
  3537. + if (!sock_owned_by_user(meta_sk) && inet->recverr) {
  3538. sk->sk_err = err;
  3539. sk->sk_error_report(sk);
  3540. } else { /* Only an error on timeout */
  3541. @@ -543,7 +552,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
  3542. }
  3543. out:
  3544. - bh_unlock_sock(sk);
  3545. + bh_unlock_sock(meta_sk);
  3546. sock_put(sk);
  3547. }
  3548. @@ -585,7 +594,7 @@ EXPORT_SYMBOL(tcp_v4_send_check);
  3549. * Exception: precedence violation. We do not implement it in any case.
  3550. */
  3551. -static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
  3552. +void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
  3553. {
  3554. const struct tcphdr *th = tcp_hdr(skb);
  3555. struct {
  3556. @@ -709,10 +718,10 @@ release_sk1:
  3557. outside socket context is ugly, certainly. What can I do?
  3558. */
  3559. -static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
  3560. +static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
  3561. u32 win, u32 tsval, u32 tsecr, int oif,
  3562. struct tcp_md5sig_key *key,
  3563. - int reply_flags, u8 tos)
  3564. + int reply_flags, u8 tos, int mptcp)
  3565. {
  3566. const struct tcphdr *th = tcp_hdr(skb);
  3567. struct {
  3568. @@ -721,6 +730,10 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
  3569. #ifdef CONFIG_TCP_MD5SIG
  3570. + (TCPOLEN_MD5SIG_ALIGNED >> 2)
  3571. #endif
  3572. +#ifdef CONFIG_MPTCP
  3573. + + ((MPTCP_SUB_LEN_DSS >> 2) +
  3574. + (MPTCP_SUB_LEN_ACK >> 2))
  3575. +#endif
  3576. ];
  3577. } rep;
  3578. struct ip_reply_arg arg;
  3579. @@ -765,6 +778,21 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
  3580. ip_hdr(skb)->daddr, &rep.th);
  3581. }
  3582. #endif
  3583. +#ifdef CONFIG_MPTCP
  3584. + if (mptcp) {
  3585. + int offset = (tsecr) ? 3 : 0;
  3586. + /* Construction of 32-bit data_ack */
  3587. + rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) |
  3588. + ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
  3589. + (0x20 << 8) |
  3590. + (0x01));
  3591. + rep.opt[offset] = htonl(data_ack);
  3592. +
  3593. + arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
  3594. + rep.th.doff = arg.iov[0].iov_len / 4;
  3595. + }
  3596. +#endif /* CONFIG_MPTCP */
  3597. +
  3598. arg.flags = reply_flags;
  3599. arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
  3600. ip_hdr(skb)->saddr, /* XXX */
  3601. @@ -783,36 +811,44 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
  3602. {
  3603. struct inet_timewait_sock *tw = inet_twsk(sk);
  3604. struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
  3605. + u32 data_ack = 0;
  3606. + int mptcp = 0;
  3607. +
  3608. + if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
  3609. + data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
  3610. + mptcp = 1;
  3611. + }
  3612. tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
  3613. + data_ack,
  3614. tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
  3615. tcp_time_stamp + tcptw->tw_ts_offset,
  3616. tcptw->tw_ts_recent,
  3617. tw->tw_bound_dev_if,
  3618. tcp_twsk_md5_key(tcptw),
  3619. tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
  3620. - tw->tw_tos
  3621. + tw->tw_tos, mptcp
  3622. );
  3623. inet_twsk_put(tw);
  3624. }
  3625. -static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  3626. - struct request_sock *req)
  3627. +void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  3628. + struct request_sock *req)
  3629. {
  3630. /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
  3631. * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
  3632. */
  3633. tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
  3634. tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
  3635. - tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
  3636. + tcp_rsk(req)->rcv_nxt, 0, req->rcv_wnd,
  3637. tcp_time_stamp,
  3638. req->ts_recent,
  3639. 0,
  3640. tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
  3641. AF_INET),
  3642. inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
  3643. - ip_hdr(skb)->tos);
  3644. + ip_hdr(skb)->tos, 0);
  3645. }
  3646. /*
  3647. @@ -820,9 +856,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  3648. * This still operates on a request_sock only, not on a big
  3649. * socket.
  3650. */
  3651. -static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
  3652. - struct request_sock *req,
  3653. - u16 queue_mapping)
  3654. +int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
  3655. + struct request_sock *req,
  3656. + u16 queue_mapping)
  3657. {
  3658. const struct inet_request_sock *ireq = inet_rsk(req);
  3659. struct flowi4 fl4;
  3660. @@ -850,7 +886,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
  3661. return err;
  3662. }
  3663. -static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
  3664. +int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
  3665. {
  3666. int res = tcp_v4_send_synack(sk, NULL, req, 0);
  3667. @@ -862,7 +898,7 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
  3668. /*
  3669. * IPv4 request_sock destructor.
  3670. */
  3671. -static void tcp_v4_reqsk_destructor(struct request_sock *req)
  3672. +void tcp_v4_reqsk_destructor(struct request_sock *req)
  3673. {
  3674. kfree(inet_rsk(req)->opt);
  3675. }
  3676. @@ -902,7 +938,7 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
  3677. /*
  3678. * Save and compile IPv4 options into the request_sock if needed.
  3679. */
  3680. -static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
  3681. +struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
  3682. {
  3683. const struct ip_options *opt = &(IPCB(skb)->opt);
  3684. struct ip_options_rcu *dopt = NULL;
  3685. @@ -1254,7 +1290,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
  3686. };
  3687. #ifdef CONFIG_TCP_MD5SIG
  3688. -static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
  3689. +const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
  3690. .md5_lookup = tcp_v4_reqsk_md5_lookup,
  3691. .calc_md5_hash = tcp_v4_md5_hash_skb,
  3692. };
  3693. @@ -1412,7 +1448,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk,
  3694. tcp_init_congestion_control(child);
  3695. tcp_mtup_init(child);
  3696. tcp_init_metrics(child);
  3697. - tcp_init_buffer_space(child);
  3698. + tp->init_buffer_space(child);
  3699. /* Queue the data carried in the SYN packet. We need to first
  3700. * bump skb's refcnt because the caller will attempt to free it.
  3701. @@ -1444,6 +1480,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk,
  3702. int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
  3703. {
  3704. struct tcp_options_received tmp_opt;
  3705. + struct mptcp_options_received mopt;
  3706. struct request_sock *req;
  3707. struct inet_request_sock *ireq;
  3708. struct tcp_sock *tp = tcp_sk(sk);
  3709. @@ -1458,6 +1495,22 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
  3710. struct sk_buff *skb_synack;
  3711. int do_fastopen;
  3712. + tcp_clear_options(&tmp_opt);
  3713. + tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
  3714. + tmp_opt.user_mss = tp->rx_opt.user_mss;
  3715. + mptcp_init_mp_opt(&mopt);
  3716. + tcp_parse_options(skb, &tmp_opt, &mopt, 0, want_cookie ? NULL : &foc);
  3717. +
  3718. +#ifdef CONFIG_MPTCP
  3719. + /* MPTCP structures not initialized, so clear MPTCP fields */
  3720. + if (mptcp_init_failed)
  3721. + mptcp_init_mp_opt(&mopt);
  3722. +
  3723. + if (mopt.is_mp_join)
  3724. + return mptcp_do_join_short(skb, &mopt, &tmp_opt, sock_net(sk));
  3725. + if (mopt.drop_me)
  3726. + goto drop;
  3727. +#endif
  3728. /* Never answer to SYNs send to broadcast or multicast */
  3729. if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
  3730. goto drop;
  3731. @@ -1483,7 +1536,22 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
  3732. goto drop;
  3733. }
  3734. - req = inet_reqsk_alloc(&tcp_request_sock_ops);
  3735. +#ifdef CONFIG_MPTCP
  3736. + if (sysctl_mptcp_enabled == MPTCP_APP && !tp->mptcp_enabled)
  3737. + mopt.saw_mpc = 0;
  3738. + if (mopt.saw_mpc && !want_cookie) {
  3739. + req = inet_reqsk_alloc(&mptcp_request_sock_ops);
  3740. +
  3741. + if (!req)
  3742. + goto drop;
  3743. +
  3744. + mptcp_rsk(req)->mpcb = NULL;
  3745. + mptcp_rsk(req)->dss_csum = mopt.dss_csum;
  3746. + mptcp_rsk(req)->collide_tk.pprev = NULL;
  3747. + } else
  3748. +#endif
  3749. + req = inet_reqsk_alloc(&tcp_request_sock_ops);
  3750. +
  3751. if (!req)
  3752. goto drop;
  3753. @@ -1491,17 +1559,15 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
  3754. tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
  3755. #endif
  3756. - tcp_clear_options(&tmp_opt);
  3757. - tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
  3758. - tmp_opt.user_mss = tp->rx_opt.user_mss;
  3759. - tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
  3760. -
  3761. if (want_cookie && !tmp_opt.saw_tstamp)
  3762. tcp_clear_options(&tmp_opt);
  3763. tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
  3764. tcp_openreq_init(req, &tmp_opt, skb);
  3765. + if (mopt.saw_mpc && !want_cookie)
  3766. + mptcp_reqsk_new_mptcp(req, &tmp_opt, &mopt, skb);
  3767. +
  3768. ireq = inet_rsk(req);
  3769. ireq->ir_loc_addr = daddr;
  3770. ireq->ir_rmt_addr = saddr;
  3771. @@ -1713,7 +1779,7 @@ put_and_exit:
  3772. }
  3773. EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
  3774. -static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
  3775. +struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
  3776. {
  3777. struct tcphdr *th = tcp_hdr(skb);
  3778. const struct iphdr *iph = ip_hdr(skb);
  3779. @@ -1730,8 +1796,15 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
  3780. if (nsk) {
  3781. if (nsk->sk_state != TCP_TIME_WAIT) {
  3782. + /* Don't lock the meta-sk again. It has been locked
  3783. + * before mptcp_v4_do_rcv.
  3784. + */
  3785. + if (tcp_sk(nsk)->mpc && !is_meta_sk(sk))
  3786. + bh_lock_sock(mptcp_meta_sk(nsk));
  3787. bh_lock_sock(nsk);
  3788. +
  3789. return nsk;
  3790. +
  3791. }
  3792. inet_twsk_put(inet_twsk(nsk));
  3793. return NULL;
  3794. @@ -1788,6 +1861,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
  3795. goto discard;
  3796. #endif
  3797. + if (is_meta_sk(sk))
  3798. + return mptcp_v4_do_rcv(sk, skb);
  3799. +
  3800. if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
  3801. struct dst_entry *dst = sk->sk_rx_dst;
  3802. @@ -1919,7 +1995,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
  3803. } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
  3804. wake_up_interruptible_sync_poll(sk_sleep(sk),
  3805. POLLIN | POLLRDNORM | POLLRDBAND);
  3806. - if (!inet_csk_ack_scheduled(sk))
  3807. + if (!inet_csk_ack_scheduled(sk) && !tp->mpc)
  3808. inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
  3809. (3 * tcp_rto_min(sk)) / 4,
  3810. TCP_RTO_MAX);
  3811. @@ -1936,7 +2012,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
  3812. {
  3813. const struct iphdr *iph;
  3814. const struct tcphdr *th;
  3815. - struct sock *sk;
  3816. + struct sock *sk, *meta_sk = NULL;
  3817. int ret;
  3818. struct net *net = dev_net(skb->dev);
  3819. @@ -1969,18 +2045,42 @@ int tcp_v4_rcv(struct sk_buff *skb)
  3820. TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
  3821. skb->len - th->doff * 4);
  3822. TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
  3823. +#ifdef CONFIG_MPTCP
  3824. + TCP_SKB_CB(skb)->mptcp_flags = 0;
  3825. + TCP_SKB_CB(skb)->dss_off = 0;
  3826. +#endif
  3827. TCP_SKB_CB(skb)->when = 0;
  3828. TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
  3829. TCP_SKB_CB(skb)->sacked = 0;
  3830. sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
  3831. - if (!sk)
  3832. - goto no_tcp_socket;
  3833. process:
  3834. - if (sk->sk_state == TCP_TIME_WAIT)
  3835. + if (sk && sk->sk_state == TCP_TIME_WAIT)
  3836. goto do_time_wait;
  3837. +#ifdef CONFIG_MPTCP
  3838. + if (!sk && th->syn && !th->ack) {
  3839. + int ret = mptcp_lookup_join(skb, NULL);
  3840. +
  3841. + if (ret < 0) {
  3842. + tcp_v4_send_reset(NULL, skb);
  3843. + goto discard_it;
  3844. + } else if (ret > 0) {
  3845. + return 0;
  3846. + }
  3847. + }
  3848. +
  3849. + /* Is there a pending request sock for this segment ? */
  3850. + if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
  3851. + if (sk)
  3852. + sock_put(sk);
  3853. + return 0;
  3854. + }
  3855. +#endif
  3856. + if (!sk)
  3857. + goto no_tcp_socket;
  3858. +
  3859. if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
  3860. NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
  3861. goto discard_and_relse;
  3862. @@ -1996,11 +2096,21 @@ process:
  3863. sk_mark_napi_id(sk, skb);
  3864. skb->dev = NULL;
  3865. - bh_lock_sock_nested(sk);
  3866. + if (tcp_sk(sk)->mpc) {
  3867. + meta_sk = mptcp_meta_sk(sk);
  3868. +
  3869. + bh_lock_sock_nested(meta_sk);
  3870. + if (sock_owned_by_user(meta_sk))
  3871. + skb->sk = sk;
  3872. + } else {
  3873. + meta_sk = sk;
  3874. + bh_lock_sock_nested(sk);
  3875. + }
  3876. +
  3877. ret = 0;
  3878. - if (!sock_owned_by_user(sk)) {
  3879. + if (!sock_owned_by_user(meta_sk)) {
  3880. #ifdef CONFIG_NET_DMA
  3881. - struct tcp_sock *tp = tcp_sk(sk);
  3882. + struct tcp_sock *tp = tcp_sk(meta_sk);
  3883. if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
  3884. tp->ucopy.dma_chan = net_dma_find_channel();
  3885. if (tp->ucopy.dma_chan)
  3886. @@ -2008,16 +2118,16 @@ process:
  3887. else
  3888. #endif
  3889. {
  3890. - if (!tcp_prequeue(sk, skb))
  3891. + if (!tcp_prequeue(meta_sk, skb))
  3892. ret = tcp_v4_do_rcv(sk, skb);
  3893. }
  3894. - } else if (unlikely(sk_add_backlog(sk, skb,
  3895. - sk->sk_rcvbuf + sk->sk_sndbuf))) {
  3896. - bh_unlock_sock(sk);
  3897. + } else if (unlikely(sk_add_backlog(meta_sk, skb,
  3898. + meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
  3899. + bh_unlock_sock(meta_sk);
  3900. NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
  3901. goto discard_and_relse;
  3902. }
  3903. - bh_unlock_sock(sk);
  3904. + bh_unlock_sock(meta_sk);
  3905. sock_put(sk);
  3906. @@ -2072,6 +2182,18 @@ do_time_wait:
  3907. sk = sk2;
  3908. goto process;
  3909. }
  3910. +#ifdef CONFIG_MPTCP
  3911. + if (th->syn && !th->ack) {
  3912. + int ret = mptcp_lookup_join(skb, inet_twsk(sk));
  3913. +
  3914. + if (ret < 0) {
  3915. + tcp_v4_send_reset(NULL, skb);
  3916. + goto discard_it;
  3917. + } else if (ret > 0) {
  3918. + return 0;
  3919. + }
  3920. + }
  3921. +#endif
  3922. /* Fall through to ACK */
  3923. }
  3924. case TCP_TW_ACK:
  3925. @@ -2154,6 +2276,11 @@ void tcp_v4_destroy_sock(struct sock *sk)
  3926. tcp_cleanup_congestion_control(sk);
  3927. + if (tp->mpc)
  3928. + mptcp_destroy_sock(sk);
  3929. + if (tp->inside_tk_table)
  3930. + mptcp_hash_remove(tp);
  3931. +
  3932. /* Cleanup up the write buffer. */
  3933. tcp_write_queue_purge(sk);
  3934. diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
  3935. index 7a436c5..72f9b8e 100644
  3936. --- a/net/ipv4/tcp_minisocks.c
  3937. +++ b/net/ipv4/tcp_minisocks.c
  3938. @@ -18,11 +18,13 @@
  3939. * Jorge Cwik, <jorge@laser.satlink.net>
  3940. */
  3941. +#include <linux/kconfig.h>
  3942. #include <linux/mm.h>
  3943. #include <linux/module.h>
  3944. #include <linux/slab.h>
  3945. #include <linux/sysctl.h>
  3946. #include <linux/workqueue.h>
  3947. +#include <net/mptcp.h>
  3948. #include <net/tcp.h>
  3949. #include <net/inet_common.h>
  3950. #include <net/xfrm.h>
  3951. @@ -95,10 +97,13 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
  3952. struct tcp_options_received tmp_opt;
  3953. struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
  3954. bool paws_reject = false;
  3955. + struct mptcp_options_received mopt;
  3956. tmp_opt.saw_tstamp = 0;
  3957. if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
  3958. - tcp_parse_options(skb, &tmp_opt, 0, NULL);
  3959. + mptcp_init_mp_opt(&mopt);
  3960. +
  3961. + tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
  3962. if (tmp_opt.saw_tstamp) {
  3963. tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
  3964. @@ -106,6 +111,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
  3965. tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
  3966. paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
  3967. }
  3968. +
  3969. + if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) {
  3970. + if (mopt.mptcp_key == tcptw->mptcp_tw->loc_key)
  3971. + goto kill_with_rst;
  3972. + }
  3973. }
  3974. if (tw->tw_substate == TCP_FIN_WAIT2) {
  3975. @@ -128,6 +138,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
  3976. if (!th->ack ||
  3977. !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
  3978. TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
  3979. + /* If mptcp_is_data_fin() returns true, we are sure that
  3980. + * mopt has been initialized - otherwise it would not
  3981. + * be a DATA_FIN.
  3982. + */
  3983. + if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw &&
  3984. + mptcp_is_data_fin(skb) &&
  3985. + TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
  3986. + mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt)
  3987. + return TCP_TW_ACK;
  3988. +
  3989. inet_twsk_put(tw);
  3990. return TCP_TW_SUCCESS;
  3991. }
  3992. @@ -270,6 +290,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
  3993. const struct tcp_sock *tp = tcp_sk(sk);
  3994. bool recycle_ok = false;
  3995. + if (is_meta_sk(sk)) {
  3996. + mptcp_update_tw_socks(tp, state);
  3997. + goto tcp_done;
  3998. + }
  3999. +
  4000. if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
  4001. recycle_ok = tcp_remember_stamp(sk);
  4002. @@ -290,6 +315,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
  4003. tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
  4004. tcptw->tw_ts_offset = tp->tsoffset;
  4005. + if (tp->mpc) {
  4006. + if (mptcp_time_wait(sk, tcptw)) {
  4007. + inet_twsk_free(tw);
  4008. + goto exit;
  4009. + }
  4010. + } else {
  4011. + tcptw->mptcp_tw = NULL;
  4012. + }
  4013. +
  4014. #if IS_ENABLED(CONFIG_IPV6)
  4015. if (tw->tw_family == PF_INET6) {
  4016. struct ipv6_pinfo *np = inet6_sk(sk);
  4017. @@ -347,15 +381,19 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
  4018. NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
  4019. }
  4020. +exit:
  4021. tcp_update_metrics(sk);
  4022. +tcp_done:
  4023. tcp_done(sk);
  4024. }
  4025. void tcp_twsk_destructor(struct sock *sk)
  4026. {
  4027. -#ifdef CONFIG_TCP_MD5SIG
  4028. struct tcp_timewait_sock *twsk = tcp_twsk(sk);
  4029. + if (twsk->mptcp_tw)
  4030. + mptcp_twsk_destructor(twsk);
  4031. +#ifdef CONFIG_TCP_MD5SIG
  4032. if (twsk->tw_md5_key)
  4033. kfree_rcu(twsk->tw_md5_key, rcu);
  4034. #endif
  4035. @@ -392,6 +430,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
  4036. newtp->snd_sml = newtp->snd_una =
  4037. newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
  4038. +#ifdef CONFIG_MPTCP
  4039. + memset(&newtp->rcvq_space, 0, sizeof(newtp->rcvq_space));
  4040. +#endif
  4041. tcp_prequeue_init(newtp);
  4042. INIT_LIST_HEAD(&newtp->tsq_node);
  4043. @@ -436,7 +477,11 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
  4044. newtp->urg_data = 0;
  4045. - if (sock_flag(newsk, SOCK_KEEPOPEN))
  4046. + /* MPTCP: If we are creating a subflow, KEEPOPEN might have been
  4047. + * set on the meta. But, keepalive is entirely handled at the
  4048. + * meta-socket, so let's keep it there.
  4049. + */
  4050. + if (sock_flag(newsk, SOCK_KEEPOPEN) && is_meta_sk(sk))
  4051. inet_csk_reset_keepalive_timer(newsk,
  4052. keepalive_time_when(newtp));
  4053. @@ -468,6 +513,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
  4054. newtp->rx_opt.ts_recent_stamp = 0;
  4055. newtp->tcp_header_len = sizeof(struct tcphdr);
  4056. }
  4057. + if (treq->saw_mpc)
  4058. + newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
  4059. newtp->tsoffset = 0;
  4060. #ifdef CONFIG_TCP_MD5SIG
  4061. newtp->md5sig_info = NULL; /*XXX*/
  4062. @@ -504,16 +551,20 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
  4063. bool fastopen)
  4064. {
  4065. struct tcp_options_received tmp_opt;
  4066. + struct mptcp_options_received mopt;
  4067. struct sock *child;
  4068. const struct tcphdr *th = tcp_hdr(skb);
  4069. __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
  4070. bool paws_reject = false;
  4071. - BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
  4072. + BUG_ON(!tcp_sk(sk)->mpc && fastopen == (sk->sk_state == TCP_LISTEN));
  4073. tmp_opt.saw_tstamp = 0;
  4074. +
  4075. + mptcp_init_mp_opt(&mopt);
  4076. +
  4077. if (th->doff > (sizeof(struct tcphdr)>>2)) {
  4078. - tcp_parse_options(skb, &tmp_opt, 0, NULL);
  4079. + tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
  4080. if (tmp_opt.saw_tstamp) {
  4081. tmp_opt.ts_recent = req->ts_recent;
  4082. @@ -552,7 +603,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
  4083. *
  4084. * Reset timer after retransmitting SYNACK, similar to
  4085. * the idea of fast retransmit in recovery.
  4086. + *
  4087. + * Fall back to TCP if MP_CAPABLE is not set.
  4088. */
  4089. +
  4090. + if (tcp_rsk(req)->saw_mpc && !mopt.saw_mpc)
  4091. + tcp_rsk(req)->saw_mpc = false;
  4092. +
  4093. +
  4094. if (!inet_rtx_syn_ack(sk, req))
  4095. req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
  4096. TCP_RTO_MAX) + jiffies;
  4097. @@ -674,7 +732,20 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
  4098. /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
  4099. if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
  4100. - TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
  4101. + TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1 &&
  4102. + /* TODO MPTCP:
  4103. + * We do this here, because otherwise options sent in the third ack,
  4104. + * or duplicate fourth ack will get lost. Options like MP_PRIO, ADD_ADDR,...
  4105. + *
  4106. + * We could store them in request_sock, but this would mean that we
  4107. + * have to put tcp_options_received and mptcp_options_received in there,
  4108. + * increasing considerably the size of the request-sock.
  4109. + *
  4110. + * As soon as we have reworked the request-sock MPTCP-fields and
  4111. + * created a mptcp_request_sock structure, we can handle options
  4112. + * correctly there without increasing request_sock.
  4113. + */
  4114. + !tcp_rsk(req)->saw_mpc) {
  4115. inet_rsk(req)->acked = 1;
  4116. NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
  4117. return NULL;
  4118. @@ -686,10 +757,29 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
  4119. * ESTABLISHED STATE. If it will be dropped after
  4120. * socket is created, wait for troubles.
  4121. */
  4122. - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
  4123. +#ifdef CONFIG_MPTCP
  4124. + if (tcp_sk(sk)->mpc)
  4125. + /* MPTCP: We call the mptcp-specific syn_recv_sock */
  4126. + child = tcp_sk(sk)->mpcb->syn_recv_sock(sk, skb, req, NULL);
  4127. + else
  4128. +#endif
  4129. + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
  4130. + req, NULL);
  4131. +
  4132. if (child == NULL)
  4133. goto listen_overflow;
  4134. + if (!is_meta_sk(sk)) {
  4135. + int ret = mptcp_check_req_master(sk, child, req, prev, &mopt);
  4136. + if (ret < 0)
  4137. + goto listen_overflow;
  4138. +
  4139. + /* MPTCP-supported */
  4140. + if (!ret)
  4141. + return tcp_sk(child)->mpcb->master_sk;
  4142. + } else {
  4143. + return mptcp_check_req_child(sk, child, req, prev, &mopt);
  4144. + }
  4145. inet_csk_reqsk_queue_unlink(sk, req, prev);
  4146. inet_csk_reqsk_queue_removed(sk, req);
  4147. @@ -739,8 +829,9 @@ int tcp_child_process(struct sock *parent, struct sock *child,
  4148. {
  4149. int ret = 0;
  4150. int state = child->sk_state;
  4151. + struct sock *meta_sk = tcp_sk(child)->mpc ? mptcp_meta_sk(child) : child;
  4152. - if (!sock_owned_by_user(child)) {
  4153. + if (!sock_owned_by_user(meta_sk)) {
  4154. ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
  4155. skb->len);
  4156. /* Wakeup parent, send SIGIO */
  4157. @@ -751,10 +842,14 @@ int tcp_child_process(struct sock *parent, struct sock *child,
  4158. * in main socket hash table and lock on listening
  4159. * socket does not protect us more.
  4160. */
  4161. - __sk_add_backlog(child, skb);
  4162. + if (tcp_sk(child)->mpc)
  4163. + skb->sk = child;
  4164. + __sk_add_backlog(meta_sk, skb);
  4165. }
  4166. - bh_unlock_sock(child);
  4167. + if (tcp_sk(child)->mpc)
  4168. + bh_unlock_sock(child);
  4169. + bh_unlock_sock(meta_sk);
  4170. sock_put(child);
  4171. return ret;
  4172. }
  4173. diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
  4174. index 17a11e6..6b45057 100644
  4175. --- a/net/ipv4/tcp_output.c
  4176. +++ b/net/ipv4/tcp_output.c
  4177. @@ -36,6 +36,12 @@
  4178. #define pr_fmt(fmt) "TCP: " fmt
  4179. +#include <net/mptcp.h>
  4180. +#include <net/mptcp_v4.h>
  4181. +#if IS_ENABLED(CONFIG_IPV6)
  4182. +#include <net/mptcp_v6.h>
  4183. +#endif
  4184. +#include <net/ipv6.h>
  4185. #include <net/tcp.h>
  4186. #include <linux/compiler.h>
  4187. @@ -72,7 +78,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
  4188. int push_one, gfp_t gfp);
  4189. /* Account for new data that has been sent to the network. */
  4190. -static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
  4191. +void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
  4192. {
  4193. struct inet_connection_sock *icsk = inet_csk(sk);
  4194. struct tcp_sock *tp = tcp_sk(sk);
  4195. @@ -211,7 +217,7 @@ u32 tcp_default_init_rwnd(u32 mss)
  4196. void tcp_select_initial_window(int __space, __u32 mss,
  4197. __u32 *rcv_wnd, __u32 *window_clamp,
  4198. int wscale_ok, __u8 *rcv_wscale,
  4199. - __u32 init_rcv_wnd)
  4200. + __u32 init_rcv_wnd, const struct sock *sk)
  4201. {
  4202. unsigned int space = (__space < 0 ? 0 : __space);
  4203. @@ -266,11 +272,15 @@ EXPORT_SYMBOL(tcp_select_initial_window);
  4204. * value can be stuffed directly into th->window for an outgoing
  4205. * frame.
  4206. */
  4207. -static u16 tcp_select_window(struct sock *sk)
  4208. +u16 tcp_select_window(struct sock *sk)
  4209. {
  4210. struct tcp_sock *tp = tcp_sk(sk);
  4211. - u32 cur_win = tcp_receive_window(tp);
  4212. - u32 new_win = __tcp_select_window(sk);
  4213. + /* The window must never shrink at the meta-level. At the subflow we
  4214. + * have to allow this. Otherwise we may announce a window too large
  4215. + * for the current meta-level sk_rcvbuf.
  4216. + */
  4217. + u32 cur_win = tcp_receive_window(tp->mpc ? tcp_sk(mptcp_meta_sk(sk)) : tp);
  4218. + u32 new_win = tp->__select_window(sk);
  4219. /* Never shrink the offered window */
  4220. if (new_win < cur_win) {
  4221. @@ -283,6 +293,7 @@ static u16 tcp_select_window(struct sock *sk)
  4222. */
  4223. new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
  4224. }
  4225. +
  4226. tp->rcv_wnd = new_win;
  4227. tp->rcv_wup = tp->rcv_nxt;
  4228. @@ -361,7 +372,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
  4229. /* Constructs common control bits of non-data skb. If SYN/FIN is present,
  4230. * auto increment end seqno.
  4231. */
  4232. -static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
  4233. +void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
  4234. {
  4235. struct skb_shared_info *shinfo = skb_shinfo(skb);
  4236. @@ -381,7 +392,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
  4237. TCP_SKB_CB(skb)->end_seq = seq;
  4238. }
  4239. -static inline bool tcp_urg_mode(const struct tcp_sock *tp)
  4240. +bool tcp_urg_mode(const struct tcp_sock *tp)
  4241. {
  4242. return tp->snd_una != tp->snd_up;
  4243. }
  4244. @@ -391,17 +402,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
  4245. #define OPTION_MD5 (1 << 2)
  4246. #define OPTION_WSCALE (1 << 3)
  4247. #define OPTION_FAST_OPEN_COOKIE (1 << 8)
  4248. -
  4249. -struct tcp_out_options {
  4250. - u16 options; /* bit field of OPTION_* */
  4251. - u16 mss; /* 0 to disable */
  4252. - u8 ws; /* window scale, 0 to disable */
  4253. - u8 num_sack_blocks; /* number of SACK blocks to include */
  4254. - u8 hash_size; /* bytes in hash_location */
  4255. - __u8 *hash_location; /* temporary pointer, overloaded */
  4256. - __u32 tsval, tsecr; /* need to include OPTION_TS */
  4257. - struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
  4258. -};
  4259. +/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */
  4260. /* Write previously computed TCP options to the packet.
  4261. *
  4262. @@ -417,7 +418,7 @@ struct tcp_out_options {
  4263. * (but it may well be that other scenarios fail similarly).
  4264. */
  4265. static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
  4266. - struct tcp_out_options *opts)
  4267. + struct tcp_out_options *opts, struct sk_buff *skb)
  4268. {
  4269. u16 options = opts->options; /* mungable copy */
  4270. @@ -500,6 +501,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
  4271. }
  4272. ptr += (foc->len + 3) >> 2;
  4273. }
  4274. +
  4275. + if (unlikely(OPTION_MPTCP & opts->options))
  4276. + mptcp_options_write(ptr, tp, opts, skb);
  4277. }
  4278. /* Compute TCP options for SYN packets. This is not the final
  4279. @@ -551,6 +555,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
  4280. if (unlikely(!(OPTION_TS & opts->options)))
  4281. remaining -= TCPOLEN_SACKPERM_ALIGNED;
  4282. }
  4283. + if (tp->request_mptcp || tp->mpc)
  4284. + mptcp_syn_options(sk, opts, &remaining);
  4285. if (fastopen && fastopen->cookie.len >= 0) {
  4286. u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
  4287. @@ -624,6 +630,9 @@ static unsigned int tcp_synack_options(struct sock *sk,
  4288. }
  4289. }
  4290. + if (tcp_rsk(req)->saw_mpc)
  4291. + mptcp_synack_options(req, opts, &remaining);
  4292. +
  4293. return MAX_TCP_OPTION_SPACE - remaining;
  4294. }
  4295. @@ -657,16 +666,22 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
  4296. opts->tsecr = tp->rx_opt.ts_recent;
  4297. size += TCPOLEN_TSTAMP_ALIGNED;
  4298. }
  4299. + if (tp->mpc)
  4300. + mptcp_established_options(sk, skb, opts, &size);
  4301. eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
  4302. if (unlikely(eff_sacks)) {
  4303. - const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
  4304. - opts->num_sack_blocks =
  4305. - min_t(unsigned int, eff_sacks,
  4306. - (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
  4307. - TCPOLEN_SACK_PERBLOCK);
  4308. - size += TCPOLEN_SACK_BASE_ALIGNED +
  4309. - opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
  4310. + const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
  4311. + if (remaining < TCPOLEN_SACK_BASE_ALIGNED)
  4312. + opts->num_sack_blocks = 0;
  4313. + else
  4314. + opts->num_sack_blocks =
  4315. + min_t(unsigned int, eff_sacks,
  4316. + (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
  4317. + TCPOLEN_SACK_PERBLOCK);
  4318. + if (opts->num_sack_blocks)
  4319. + size += TCPOLEN_SACK_BASE_ALIGNED +
  4320. + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
  4321. }
  4322. return size;
  4323. @@ -714,7 +729,7 @@ static void tcp_tasklet_func(unsigned long data)
  4324. unsigned long flags;
  4325. struct list_head *q, *n;
  4326. struct tcp_sock *tp;
  4327. - struct sock *sk;
  4328. + struct sock *sk, *meta_sk;
  4329. local_irq_save(flags);
  4330. list_splice_init(&tsq->head, &list);
  4331. @@ -725,15 +740,27 @@ static void tcp_tasklet_func(unsigned long data)
  4332. list_del(&tp->tsq_node);
  4333. sk = (struct sock *)tp;
  4334. - bh_lock_sock(sk);
  4335. + meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
  4336. + bh_lock_sock(meta_sk);
  4337. - if (!sock_owned_by_user(sk)) {
  4338. + if (!sock_owned_by_user(meta_sk)) {
  4339. tcp_tsq_handler(sk);
  4340. + if (tp->mpc)
  4341. + tcp_tsq_handler(meta_sk);
  4342. } else {
  4343. /* defer the work to tcp_release_cb() */
  4344. set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
  4345. +
  4346. + /* For MPTCP, we set the tsq-bit on the meta, and the
  4347. + * subflow as we don't know if the limitation happened
  4348. + * while inside mptcp_write_xmit or during tcp_write_xmit.
  4349. + */
  4350. + if (tp->mpc) {
  4351. + set_bit(TCP_TSQ_DEFERRED, &tcp_sk(meta_sk)->tsq_flags);
  4352. + mptcp_tsq_flags(sk);
  4353. + }
  4354. }
  4355. - bh_unlock_sock(sk);
  4356. + bh_unlock_sock(meta_sk);
  4357. clear_bit(TSQ_QUEUED, &tp->tsq_flags);
  4358. sk_free(sk);
  4359. @@ -743,7 +770,10 @@ static void tcp_tasklet_func(unsigned long data)
  4360. #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
  4361. (1UL << TCP_WRITE_TIMER_DEFERRED) | \
  4362. (1UL << TCP_DELACK_TIMER_DEFERRED) | \
  4363. - (1UL << TCP_MTU_REDUCED_DEFERRED))
  4364. + (1UL << TCP_MTU_REDUCED_DEFERRED) | \
  4365. + (1UL << MPTCP_PATH_MANAGER) | \
  4366. + (1UL << MPTCP_SUB_DEFERRED))
  4367. +
  4368. /**
  4369. * tcp_release_cb - tcp release_sock() callback
  4370. * @sk: socket
  4371. @@ -790,6 +820,13 @@ void tcp_release_cb(struct sock *sk)
  4372. sk->sk_prot->mtu_reduced(sk);
  4373. __sock_put(sk);
  4374. }
  4375. + if (flags & (1UL << MPTCP_PATH_MANAGER)) {
  4376. + if (tcp_sk(sk)->mpcb->pm_ops->release_sock)
  4377. + tcp_sk(sk)->mpcb->pm_ops->release_sock(sk);
  4378. + __sock_put(sk);
  4379. + }
  4380. + if (flags & (1UL << MPTCP_SUB_DEFERRED))
  4381. + mptcp_tsq_sub_deferred(sk);
  4382. }
  4383. EXPORT_SYMBOL(tcp_release_cb);
  4384. @@ -849,8 +886,8 @@ void tcp_wfree(struct sk_buff *skb)
  4385. * We are working here with either a clone of the original
  4386. * SKB, or a fresh unique copy made by the retransmit engine.
  4387. */
  4388. -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
  4389. - gfp_t gfp_mask)
  4390. +int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
  4391. + gfp_t gfp_mask)
  4392. {
  4393. const struct inet_connection_sock *icsk = inet_csk(sk);
  4394. struct inet_sock *inet;
  4395. @@ -878,10 +915,28 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
  4396. NET_INC_STATS(sock_net(sk),
  4397. LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
  4398. - if (unlikely(skb_cloned(skb)))
  4399. - skb = pskb_copy(skb, gfp_mask);
  4400. - else
  4401. + if (unlikely(skb_cloned(skb))) {
  4402. + struct sk_buff *newskb;
  4403. + if (mptcp_is_data_seq(skb))
  4404. + skb_push(skb, MPTCP_SUB_LEN_DSS_ALIGN +
  4405. + MPTCP_SUB_LEN_ACK_ALIGN +
  4406. + MPTCP_SUB_LEN_SEQ_ALIGN);
  4407. +
  4408. + newskb = pskb_copy(skb, gfp_mask);
  4409. +
  4410. + if (mptcp_is_data_seq(skb)) {
  4411. + skb_pull(skb, MPTCP_SUB_LEN_DSS_ALIGN +
  4412. + MPTCP_SUB_LEN_ACK_ALIGN +
  4413. + MPTCP_SUB_LEN_SEQ_ALIGN);
  4414. + if (newskb)
  4415. + skb_pull(newskb, MPTCP_SUB_LEN_DSS_ALIGN +
  4416. + MPTCP_SUB_LEN_ACK_ALIGN +
  4417. + MPTCP_SUB_LEN_SEQ_ALIGN);
  4418. + }
  4419. + skb = newskb;
  4420. + } else {
  4421. skb = skb_clone(skb, gfp_mask);
  4422. + }
  4423. if (unlikely(!skb))
  4424. return -ENOBUFS;
  4425. }
  4426. @@ -929,7 +984,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
  4427. */
  4428. th->window = htons(min(tp->rcv_wnd, 65535U));
  4429. } else {
  4430. - th->window = htons(tcp_select_window(sk));
  4431. + th->window = htons(tp->select_window(sk));
  4432. }
  4433. th->check = 0;
  4434. th->urg_ptr = 0;
  4435. @@ -945,7 +1000,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
  4436. }
  4437. }
  4438. - tcp_options_write((__be32 *)(th + 1), tp, &opts);
  4439. + tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
  4440. if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
  4441. TCP_ECN_send(sk, skb, tcp_header_size);
  4442. @@ -984,7 +1039,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
  4443. * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
  4444. * otherwise socket can stall.
  4445. */
  4446. -static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
  4447. +void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
  4448. {
  4449. struct tcp_sock *tp = tcp_sk(sk);
  4450. @@ -997,15 +1052,16 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
  4451. }
  4452. /* Initialize TSO segments for a packet. */
  4453. -static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
  4454. - unsigned int mss_now)
  4455. +void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
  4456. + unsigned int mss_now)
  4457. {
  4458. struct skb_shared_info *shinfo = skb_shinfo(skb);
  4459. /* Make sure we own this skb before messing gso_size/gso_segs */
  4460. WARN_ON_ONCE(skb_cloned(skb));
  4461. - if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
  4462. + if (skb->len <= mss_now || (is_meta_sk(sk) && !mptcp_sk_can_gso(sk)) ||
  4463. + (!is_meta_sk(sk) && !sk_can_gso(sk)) || skb->ip_summed == CHECKSUM_NONE) {
  4464. /* Avoid the costly divide in the normal
  4465. * non-TSO case.
  4466. */
  4467. @@ -1037,7 +1093,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
  4468. /* Pcount in the middle of the write queue got changed, we need to do various
  4469. * tweaks to fix counters
  4470. */
  4471. -static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
  4472. +void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
  4473. {
  4474. struct tcp_sock *tp = tcp_sk(sk);
  4475. @@ -1078,6 +1134,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
  4476. int nlen;
  4477. u8 flags;
  4478. + if (tcp_sk(sk)->mpc && mptcp_is_data_seq(skb))
  4479. + mptcp_fragment(sk, skb, len, mss_now, 0);
  4480. +
  4481. if (WARN_ON(len > skb->len))
  4482. return -EINVAL;
  4483. @@ -1160,7 +1219,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
  4484. * eventually). The difference is that pulled data not copied, but
  4485. * immediately discarded.
  4486. */
  4487. -static void __pskb_trim_head(struct sk_buff *skb, int len)
  4488. +void __pskb_trim_head(struct sk_buff *skb, int len)
  4489. {
  4490. struct skb_shared_info *shinfo;
  4491. int i, k, eat;
  4492. @@ -1201,6 +1260,9 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
  4493. /* Remove acked data from a packet in the transmit queue. */
  4494. int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
  4495. {
  4496. + if (tcp_sk(sk)->mpc && !is_meta_sk(sk) && mptcp_is_data_seq(skb))
  4497. + return mptcp_trim_head(sk, skb, len);
  4498. +
  4499. if (skb_unclone(skb, GFP_ATOMIC))
  4500. return -ENOMEM;
  4501. @@ -1218,6 +1280,15 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
  4502. if (tcp_skb_pcount(skb) > 1)
  4503. tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
  4504. +#ifdef CONFIG_MPTCP
  4505. + /* Some data got acked - we assume that the seq-number reached the dest.
  4506. + * Anyway, our MPTCP-option has been trimmed above - we lost it here.
  4507. + * Only remove the SEQ if the call does not come from a meta retransmit.
  4508. + */
  4509. + if (tcp_sk(sk)->mpc && !is_meta_sk(sk))
  4510. + TCP_SKB_CB(skb)->mptcp_flags &= ~MPTCPHDR_SEQ;
  4511. +#endif
  4512. +
  4513. return 0;
  4514. }
  4515. @@ -1377,7 +1448,7 @@ unsigned int tcp_current_mss(struct sock *sk)
  4516. }
  4517. /* Congestion window validation. (RFC2861) */
  4518. -static void tcp_cwnd_validate(struct sock *sk)
  4519. +void tcp_cwnd_validate(struct sock *sk)
  4520. {
  4521. struct tcp_sock *tp = tcp_sk(sk);
  4522. @@ -1411,8 +1482,8 @@ static bool tcp_minshall_check(const struct tcp_sock *tp)
  4523. * But we can avoid doing the divide again given we already have
  4524. * skb_pcount = skb->len / mss_now
  4525. */
  4526. -static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
  4527. - const struct sk_buff *skb)
  4528. +void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
  4529. + const struct sk_buff *skb)
  4530. {
  4531. if (skb->len < tcp_skb_pcount(skb) * mss_now)
  4532. tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
  4533. @@ -1433,19 +1504,28 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
  4534. (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
  4535. }
  4536. /* Returns the portion of skb which can be sent right away */
  4537. -static unsigned int tcp_mss_split_point(const struct sock *sk,
  4538. - const struct sk_buff *skb,
  4539. - unsigned int mss_now,
  4540. - unsigned int max_segs,
  4541. - int nonagle)
  4542. +unsigned int tcp_mss_split_point(const struct sock *sk,
  4543. + const struct sk_buff *skb,
  4544. + unsigned int mss_now,
  4545. + unsigned int max_segs,
  4546. + int nonagle)
  4547. {
  4548. const struct tcp_sock *tp = tcp_sk(sk);
  4549. + const struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
  4550. u32 partial, needed, window, max_len;
  4551. - window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
  4552. + if (!tp->mpc)
  4553. + window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
  4554. + else
  4555. + /* We need to evaluate the available space in the sending window
  4556. + * at the subflow level. However, the subflow seq has not yet
  4557. + * been set. Nevertheless we know that the caller will set it to
  4558. + * write_seq.
  4559. + */
  4560. + window = tcp_wnd_end(tp) - tp->write_seq;
  4561. max_len = mss_now * max_segs;
  4562. - if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
  4563. + if (likely(max_len <= window && skb != tcp_write_queue_tail(meta_sk)))
  4564. return max_len;
  4565. needed = min(skb->len, window);
  4566. @@ -1467,13 +1547,14 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
  4567. /* Can at least one segment of SKB be sent right now, according to the
  4568. * congestion window rules? If so, return how many segments are allowed.
  4569. */
  4570. -static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
  4571. - const struct sk_buff *skb)
  4572. +unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
  4573. + const struct sk_buff *skb)
  4574. {
  4575. u32 in_flight, cwnd;
  4576. /* Don't be strict about the congestion window for the final FIN. */
  4577. - if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
  4578. + if (skb &&
  4579. + ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb)) &&
  4580. tcp_skb_pcount(skb) == 1)
  4581. return 1;
  4582. @@ -1489,8 +1570,8 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
  4583. * This must be invoked the first time we consider transmitting
  4584. * SKB onto the wire.
  4585. */
  4586. -static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
  4587. - unsigned int mss_now)
  4588. +int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
  4589. + unsigned int mss_now)
  4590. {
  4591. int tso_segs = tcp_skb_pcount(skb);
  4592. @@ -1505,8 +1586,8 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
  4593. /* Return true if the Nagle test allows this packet to be
  4594. * sent now.
  4595. */
  4596. -static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
  4597. - unsigned int cur_mss, int nonagle)
  4598. +bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
  4599. + unsigned int cur_mss, int nonagle)
  4600. {
  4601. /* Nagle rule does not apply to frames, which sit in the middle of the
  4602. * write_queue (they have no chances to get new data).
  4603. @@ -1518,7 +1599,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
  4604. return true;
  4605. /* Don't use the nagle rule for urgent data (or for the final FIN). */
  4606. - if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
  4607. + if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
  4608. + mptcp_is_data_fin(skb))
  4609. return true;
  4610. if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle))
  4611. @@ -1528,9 +1610,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
  4612. }
  4613. /* Does at least the first segment of SKB fit into the send window? */
  4614. -static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
  4615. - const struct sk_buff *skb,
  4616. - unsigned int cur_mss)
  4617. +bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
  4618. + unsigned int cur_mss)
  4619. {
  4620. u32 end_seq = TCP_SKB_CB(skb)->end_seq;
  4621. @@ -1549,14 +1630,16 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
  4622. {
  4623. const struct tcp_sock *tp = tcp_sk(sk);
  4624. unsigned int cwnd_quota;
  4625. + const struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
  4626. + const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  4627. - tcp_init_tso_segs(sk, skb, cur_mss);
  4628. + tcp_init_tso_segs(meta_sk, skb, cur_mss);
  4629. - if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
  4630. + if (!tcp_nagle_test(meta_tp, skb, cur_mss, nonagle))
  4631. return 0;
  4632. cwnd_quota = tcp_cwnd_test(tp, skb);
  4633. - if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
  4634. + if (cwnd_quota && !tcp_snd_wnd_test(meta_tp, skb, cur_mss))
  4635. cwnd_quota = 0;
  4636. return cwnd_quota;
  4637. @@ -1566,12 +1649,16 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
  4638. bool tcp_may_send_now(struct sock *sk)
  4639. {
  4640. const struct tcp_sock *tp = tcp_sk(sk);
  4641. - struct sk_buff *skb = tcp_send_head(sk);
  4642. + struct sk_buff *skb;
  4643. + const struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
  4644. + const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  4645. +
  4646. + skb = tcp_send_head(meta_sk);
  4647. return skb &&
  4648. tcp_snd_test(sk, skb, tcp_current_mss(sk),
  4649. - (tcp_skb_is_last(sk, skb) ?
  4650. - tp->nonagle : TCP_NAGLE_PUSH));
  4651. + (tcp_skb_is_last(meta_sk, skb) ?
  4652. + meta_tp->nonagle : TCP_NAGLE_PUSH));
  4653. }
  4654. /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
  4655. @@ -1588,6 +1675,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  4656. int nlen = skb->len - len;
  4657. u8 flags;
  4658. + if (tcp_sk(sk)->mpc && mptcp_is_data_seq(skb))
  4659. + mptso_fragment(sk, skb, len, mss_now, gfp, 0);
  4660. +
  4661. /* All of a TSO frame must be composed of paged data. */
  4662. if (skb->len != skb->data_len)
  4663. return tcp_fragment(sk, skb, len, mss_now);
  4664. @@ -1633,29 +1723,39 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  4665. *
  4666. * This algorithm is from John Heffner.
  4667. */
  4668. -static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
  4669. +bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
  4670. {
  4671. struct tcp_sock *tp = tcp_sk(sk);
  4672. + struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
  4673. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  4674. const struct inet_connection_sock *icsk = inet_csk(sk);
  4675. u32 send_win, cong_win, limit, in_flight;
  4676. int win_divisor;
  4677. - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
  4678. + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb))
  4679. goto send_now;
  4680. if (icsk->icsk_ca_state != TCP_CA_Open)
  4681. goto send_now;
  4682. /* Defer for less than two clock ticks. */
  4683. - if (tp->tso_deferred &&
  4684. - (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
  4685. + if (meta_tp->tso_deferred &&
  4686. + (((u32)jiffies << 1) >> 1) - (meta_tp->tso_deferred >> 1) > 1)
  4687. goto send_now;
  4688. in_flight = tcp_packets_in_flight(tp);
  4689. BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight));
  4690. - send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
  4691. + if (!tp->mpc)
  4692. + send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
  4693. + else
  4694. + /* We need to evaluate the available space in the sending window
  4695. + * at the subflow level. However, the subflow seq has not yet
  4696. + * been set. Nevertheless we know that the caller will set it to
  4697. + * write_seq.
  4698. + */
  4699. + send_win = tcp_wnd_end(tp) - tp->write_seq;
  4700. /* From in_flight test above, we know that cwnd > in_flight. */
  4701. cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
  4702. @@ -1668,7 +1768,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
  4703. goto send_now;
  4704. /* Middle in queue won't get any more data, full sendable already? */
  4705. - if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
  4706. + if ((skb != tcp_write_queue_tail(meta_sk)) && (limit >= skb->len))
  4707. goto send_now;
  4708. win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
  4709. @@ -1694,13 +1794,13 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
  4710. /* Ok, it looks like it is advisable to defer.
  4711. * Do not rearm the timer if already set to not break TCP ACK clocking.
  4712. */
  4713. - if (!tp->tso_deferred)
  4714. - tp->tso_deferred = 1 | (jiffies << 1);
  4715. + if (!meta_tp->tso_deferred)
  4716. + meta_tp->tso_deferred = 1 | (jiffies << 1);
  4717. return true;
  4718. send_now:
  4719. - tp->tso_deferred = 0;
  4720. + meta_tp->tso_deferred = 0;
  4721. return false;
  4722. }
  4723. @@ -1713,7 +1813,7 @@ send_now:
  4724. * 1 if a probe was sent,
  4725. * -1 otherwise
  4726. */
  4727. -static int tcp_mtu_probe(struct sock *sk)
  4728. +int tcp_mtu_probe(struct sock *sk)
  4729. {
  4730. struct tcp_sock *tp = tcp_sk(sk);
  4731. struct inet_connection_sock *icsk = inet_csk(sk);
  4732. @@ -1858,6 +1958,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
  4733. int cwnd_quota;
  4734. int result;
  4735. + if (is_meta_sk(sk))
  4736. + return mptcp_write_xmit(sk, mss_now, nonagle, push_one, gfp);
  4737. +
  4738. sent_pkts = 0;
  4739. if (!push_one) {
  4740. @@ -2313,6 +2416,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
  4741. if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
  4742. return;
  4743. + /* Currently not supported for MPTCP - but it should be possible */
  4744. + if (tp->mpc)
  4745. + return;
  4746. +
  4747. tcp_for_write_queue_from_safe(skb, tmp, sk) {
  4748. if (!tcp_can_collapse(sk, skb))
  4749. break;
  4750. @@ -2410,10 +2517,26 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
  4751. */
  4752. if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
  4753. skb_headroom(skb) >= 0xFFFF)) {
  4754. - struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
  4755. - GFP_ATOMIC);
  4756. + struct sk_buff *nskb;
  4757. +
  4758. + if (mptcp_is_data_seq(skb))
  4759. + skb_push(skb, MPTCP_SUB_LEN_DSS_ALIGN +
  4760. + MPTCP_SUB_LEN_ACK_ALIGN +
  4761. + MPTCP_SUB_LEN_SEQ_ALIGN);
  4762. +
  4763. + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
  4764. +
  4765. + if (mptcp_is_data_seq(skb)) {
  4766. + skb_pull(skb, MPTCP_SUB_LEN_DSS_ALIGN +
  4767. + MPTCP_SUB_LEN_ACK_ALIGN +
  4768. + MPTCP_SUB_LEN_SEQ_ALIGN);
  4769. + if (nskb)
  4770. + skb_pull(nskb, MPTCP_SUB_LEN_DSS_ALIGN +
  4771. + MPTCP_SUB_LEN_ACK_ALIGN +
  4772. + MPTCP_SUB_LEN_SEQ_ALIGN);
  4773. + }
  4774. err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
  4775. - -ENOBUFS;
  4776. + -ENOBUFS;
  4777. } else {
  4778. err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
  4779. }
  4780. @@ -2640,6 +2763,11 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
  4781. {
  4782. struct sk_buff *skb;
  4783. + if (is_meta_sk(sk)) {
  4784. + mptcp_send_active_reset(sk, priority);
  4785. + return;
  4786. + }
  4787. +
  4788. /* NOTE: No TCP options attached and we never retransmit this. */
  4789. skb = alloc_skb(MAX_TCP_HEADER, priority);
  4790. if (!skb) {
  4791. @@ -2742,14 +2870,14 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
  4792. (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
  4793. req->window_clamp = tcp_full_space(sk);
  4794. - /* tcp_full_space because it is guaranteed to be the first packet */
  4795. - tcp_select_initial_window(tcp_full_space(sk),
  4796. - mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
  4797. + tp->select_initial_window(tcp_full_space(sk),
  4798. + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) -
  4799. + (tcp_rsk(req)->saw_mpc ? MPTCP_SUB_LEN_DSM_ALIGN : 0),
  4800. &req->rcv_wnd,
  4801. &req->window_clamp,
  4802. ireq->wscale_ok,
  4803. &rcv_wscale,
  4804. - dst_metric(dst, RTAX_INITRWND));
  4805. + dst_metric(dst, RTAX_INITRWND), sk);
  4806. ireq->rcv_wscale = rcv_wscale;
  4807. }
  4808. @@ -2785,7 +2913,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
  4809. /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
  4810. th->window = htons(min(req->rcv_wnd, 65535U));
  4811. - tcp_options_write((__be32 *)(th + 1), tp, &opts);
  4812. + tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
  4813. th->doff = (tcp_header_size >> 2);
  4814. TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
  4815. @@ -2839,13 +2967,13 @@ static void tcp_connect_init(struct sock *sk)
  4816. (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
  4817. tp->window_clamp = tcp_full_space(sk);
  4818. - tcp_select_initial_window(tcp_full_space(sk),
  4819. + tp->select_initial_window(tcp_full_space(sk),
  4820. tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
  4821. &tp->rcv_wnd,
  4822. &tp->window_clamp,
  4823. sysctl_tcp_window_scaling,
  4824. &rcv_wscale,
  4825. - dst_metric(dst, RTAX_INITRWND));
  4826. + dst_metric(dst, RTAX_INITRWND), sk);
  4827. tp->rx_opt.rcv_wscale = rcv_wscale;
  4828. tp->rcv_ssthresh = tp->rcv_wnd;
  4829. @@ -2869,6 +2997,38 @@ static void tcp_connect_init(struct sock *sk)
  4830. inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
  4831. inet_csk(sk)->icsk_retransmits = 0;
  4832. tcp_clear_retrans(tp);
  4833. +
  4834. +#ifdef CONFIG_MPTCP
  4835. + if (sysctl_mptcp_enabled && mptcp_doit(sk)) {
  4836. + if (is_master_tp(tp)) {
  4837. + tp->request_mptcp = 1;
  4838. + mptcp_connect_init(sk);
  4839. + } else if (tp->mptcp) {
  4840. + struct inet_sock *inet = inet_sk(sk);
  4841. +
  4842. + tp->mptcp->snt_isn = tp->write_seq;
  4843. + tp->mptcp->init_rcv_wnd = tp->rcv_wnd;
  4844. +
  4845. + /* Set nonce for new subflows */
  4846. + if (sk->sk_family == AF_INET)
  4847. + tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce(
  4848. + inet->inet_saddr,
  4849. + inet->inet_daddr,
  4850. + inet->inet_sport,
  4851. + inet->inet_dport,
  4852. + tp->write_seq);
  4853. +#if IS_ENABLED(CONFIG_IPV6)
  4854. + else
  4855. + tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce(
  4856. + inet6_sk(sk)->saddr.s6_addr32,
  4857. + sk->sk_v6_daddr.s6_addr32,
  4858. + inet->inet_sport,
  4859. + inet->inet_dport,
  4860. + tp->write_seq);
  4861. +#endif
  4862. + }
  4863. + }
  4864. +#endif
  4865. }
  4866. static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
  4867. @@ -3111,6 +3271,7 @@ void tcp_send_ack(struct sock *sk)
  4868. TCP_SKB_CB(buff)->when = tcp_time_stamp;
  4869. tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
  4870. }
  4871. +EXPORT_SYMBOL(tcp_send_ack);
  4872. /* This routine sends a packet with an out of date sequence
  4873. * number. It assumes the other end will try to ack it.
  4874. @@ -3123,7 +3284,7 @@ void tcp_send_ack(struct sock *sk)
  4875. * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
  4876. * out-of-date with SND.UNA-1 to probe window.
  4877. */
  4878. -static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
  4879. +int tcp_xmit_probe_skb(struct sock *sk, int urgent)
  4880. {
  4881. struct tcp_sock *tp = tcp_sk(sk);
  4882. struct sk_buff *skb;
  4883. @@ -3161,6 +3322,9 @@ int tcp_write_wakeup(struct sock *sk)
  4884. if (sk->sk_state == TCP_CLOSE)
  4885. return -1;
  4886. + if (is_meta_sk(sk))
  4887. + return mptcp_write_wakeup(sk);
  4888. +
  4889. if ((skb = tcp_send_head(sk)) != NULL &&
  4890. before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
  4891. int err;
  4892. diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
  4893. index 64f0354..7b55b9a 100644
  4894. --- a/net/ipv4/tcp_timer.c
  4895. +++ b/net/ipv4/tcp_timer.c
  4896. @@ -20,6 +20,7 @@
  4897. #include <linux/module.h>
  4898. #include <linux/gfp.h>
  4899. +#include <net/mptcp.h>
  4900. #include <net/tcp.h>
  4901. int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
  4902. @@ -32,7 +33,7 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
  4903. int sysctl_tcp_orphan_retries __read_mostly;
  4904. int sysctl_tcp_thin_linear_timeouts __read_mostly;
  4905. -static void tcp_write_err(struct sock *sk)
  4906. +void tcp_write_err(struct sock *sk)
  4907. {
  4908. sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
  4909. sk->sk_error_report(sk);
  4910. @@ -124,10 +125,8 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
  4911. * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
  4912. * syn_set flag is set.
  4913. */
  4914. -static bool retransmits_timed_out(struct sock *sk,
  4915. - unsigned int boundary,
  4916. - unsigned int timeout,
  4917. - bool syn_set)
  4918. +bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
  4919. + unsigned int timeout, bool syn_set)
  4920. {
  4921. unsigned int linear_backoff_thresh, start_ts;
  4922. unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
  4923. @@ -153,7 +152,7 @@ static bool retransmits_timed_out(struct sock *sk,
  4924. }
  4925. /* A write timeout has occurred. Process the after effects. */
  4926. -static int tcp_write_timeout(struct sock *sk)
  4927. +int tcp_write_timeout(struct sock *sk)
  4928. {
  4929. struct inet_connection_sock *icsk = inet_csk(sk);
  4930. struct tcp_sock *tp = tcp_sk(sk);
  4931. @@ -168,6 +167,10 @@ static int tcp_write_timeout(struct sock *sk)
  4932. }
  4933. retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
  4934. syn_set = true;
  4935. + /* Stop retransmitting MP_CAPABLE options in SYN if timed out. */
  4936. + if (tcp_sk(sk)->request_mptcp &&
  4937. + icsk->icsk_retransmits >= mptcp_sysctl_syn_retries())
  4938. + tcp_sk(sk)->request_mptcp = 0;
  4939. } else {
  4940. if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
  4941. /* Black hole detection */
  4942. @@ -248,18 +251,22 @@ out:
  4943. static void tcp_delack_timer(unsigned long data)
  4944. {
  4945. struct sock *sk = (struct sock *)data;
  4946. + struct tcp_sock *tp = tcp_sk(sk);
  4947. + struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
  4948. - bh_lock_sock(sk);
  4949. - if (!sock_owned_by_user(sk)) {
  4950. + bh_lock_sock(meta_sk);
  4951. + if (!sock_owned_by_user(meta_sk)) {
  4952. tcp_delack_timer_handler(sk);
  4953. } else {
  4954. inet_csk(sk)->icsk_ack.blocked = 1;
  4955. - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
  4956. + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED);
  4957. /* deleguate our work to tcp_release_cb() */
  4958. if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
  4959. sock_hold(sk);
  4960. + if (tp->mpc)
  4961. + mptcp_tsq_flags(sk);
  4962. }
  4963. - bh_unlock_sock(sk);
  4964. + bh_unlock_sock(meta_sk);
  4965. sock_put(sk);
  4966. }
  4967. @@ -421,6 +428,9 @@ void tcp_retransmit_timer(struct sock *sk)
  4968. tcp_enter_loss(sk, 0);
  4969. + if (tp->mpc)
  4970. + mptcp_reinject_data(sk, 1);
  4971. +
  4972. if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
  4973. /* Retransmission failed because of local congestion,
  4974. * do not backoff.
  4975. @@ -471,6 +481,8 @@ out_reset_timer:
  4976. /* Use normal (exponential) backoff */
  4977. icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
  4978. }
  4979. + if (tp->mpc)
  4980. + mptcp_set_rto(sk);
  4981. inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
  4982. if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
  4983. __sk_dst_reset(sk);
  4984. @@ -502,7 +514,10 @@ void tcp_write_timer_handler(struct sock *sk)
  4985. break;
  4986. case ICSK_TIME_RETRANS:
  4987. icsk->icsk_pending = 0;
  4988. - tcp_retransmit_timer(sk);
  4989. + if (is_meta_sk(sk))
  4990. + mptcp_retransmit_timer(sk);
  4991. + else
  4992. + tcp_retransmit_timer(sk);
  4993. break;
  4994. case ICSK_TIME_PROBE0:
  4995. icsk->icsk_pending = 0;
  4996. @@ -517,16 +532,19 @@ out:
  4997. static void tcp_write_timer(unsigned long data)
  4998. {
  4999. struct sock *sk = (struct sock *)data;
  5000. + struct sock *meta_sk = tcp_sk(sk)->mpc ? mptcp_meta_sk(sk) : sk;
  5001. - bh_lock_sock(sk);
  5002. - if (!sock_owned_by_user(sk)) {
  5003. + bh_lock_sock(meta_sk);
  5004. + if (!sock_owned_by_user(meta_sk)) {
  5005. tcp_write_timer_handler(sk);
  5006. } else {
  5007. /* deleguate our work to tcp_release_cb() */
  5008. if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
  5009. sock_hold(sk);
  5010. + if (tcp_sk(sk)->mpc)
  5011. + mptcp_tsq_flags(sk);
  5012. }
  5013. - bh_unlock_sock(sk);
  5014. + bh_unlock_sock(meta_sk);
  5015. sock_put(sk);
  5016. }
  5017. @@ -563,11 +581,12 @@ static void tcp_keepalive_timer (unsigned long data)
  5018. struct sock *sk = (struct sock *) data;
  5019. struct inet_connection_sock *icsk = inet_csk(sk);
  5020. struct tcp_sock *tp = tcp_sk(sk);
  5021. + struct sock *meta_sk = tp->mpc ? mptcp_meta_sk(sk) : sk;
  5022. u32 elapsed;
  5023. /* Only process if socket is not in use. */
  5024. - bh_lock_sock(sk);
  5025. - if (sock_owned_by_user(sk)) {
  5026. + bh_lock_sock(meta_sk);
  5027. + if (sock_owned_by_user(meta_sk)) {
  5028. /* Try again later. */
  5029. inet_csk_reset_keepalive_timer (sk, HZ/20);
  5030. goto out;
  5031. @@ -578,6 +597,29 @@ static void tcp_keepalive_timer (unsigned long data)
  5032. goto out;
  5033. }
  5034. + if (tp->send_mp_fclose) {
  5035. + /* MUST do this before tcp_write_timeout, because retrans_stamp
  5036. + * may have been set to 0 in another part while we are
  5037. + * retransmitting MP_FASTCLOSE. Then, we would crash, because
  5038. + * retransmits_timed_out accesses the meta-write-queue.
  5039. + *
  5040. + * We make sure that the timestamp is != 0.
  5041. + */
  5042. + if (!tp->retrans_stamp)
  5043. + tp->retrans_stamp = tcp_time_stamp ? : 1;
  5044. +
  5045. + if (tcp_write_timeout(sk))
  5046. + goto out;
  5047. +
  5048. + tcp_send_ack(sk);
  5049. + icsk->icsk_backoff++;
  5050. + icsk->icsk_retransmits++;
  5051. +
  5052. + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
  5053. + elapsed = icsk->icsk_rto;
  5054. + goto resched;
  5055. + }
  5056. +
  5057. if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
  5058. if (tp->linger2 >= 0) {
  5059. const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
  5060. @@ -639,7 +681,7 @@ death:
  5061. tcp_done(sk);
  5062. out:
  5063. - bh_unlock_sock(sk);
  5064. + bh_unlock_sock(meta_sk);
  5065. sock_put(sk);
  5066. }
  5067. diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
  5068. index 6c7fa08..733d602 100644
  5069. --- a/net/ipv6/addrconf.c
  5070. +++ b/net/ipv6/addrconf.c
  5071. @@ -765,6 +765,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
  5072. kfree_rcu(ifp, rcu);
  5073. }
  5074. +EXPORT_SYMBOL(inet6_ifa_finish_destroy);
  5075. static void
  5076. ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
  5077. diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
  5078. index d935889..9f0fd80 100644
  5079. --- a/net/ipv6/af_inet6.c
  5080. +++ b/net/ipv6/af_inet6.c
  5081. @@ -97,8 +97,7 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
  5082. return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
  5083. }
  5084. -static int inet6_create(struct net *net, struct socket *sock, int protocol,
  5085. - int kern)
  5086. +int inet6_create(struct net *net, struct socket *sock, int protocol, int kern)
  5087. {
  5088. struct inet_sock *inet;
  5089. struct ipv6_pinfo *np;
  5090. diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
  5091. index c913818..2f5b4c5 100644
  5092. --- a/net/ipv6/inet6_connection_sock.c
  5093. +++ b/net/ipv6/inet6_connection_sock.c
  5094. @@ -96,8 +96,8 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk,
  5095. /*
  5096. * request_sock (formerly open request) hash tables.
  5097. */
  5098. -static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
  5099. - const u32 rnd, const u32 synq_hsize)
  5100. +u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
  5101. + const u32 rnd, const u32 synq_hsize)
  5102. {
  5103. u32 c;
  5104. diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
  5105. index bb53a5e7..0d29995 100644
  5106. --- a/net/ipv6/syncookies.c
  5107. +++ b/net/ipv6/syncookies.c
  5108. @@ -181,7 +181,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
  5109. /* check for timestamp cookie support */
  5110. memset(&tcp_opt, 0, sizeof(tcp_opt));
  5111. - tcp_parse_options(skb, &tcp_opt, 0, NULL);
  5112. + tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
  5113. if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
  5114. goto out;
  5115. @@ -253,10 +253,10 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
  5116. }
  5117. req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
  5118. - tcp_select_initial_window(tcp_full_space(sk), req->mss,
  5119. + tp->select_initial_window(tcp_full_space(sk), req->mss,
  5120. &req->rcv_wnd, &req->window_clamp,
  5121. ireq->wscale_ok, &rcv_wscale,
  5122. - dst_metric(dst, RTAX_INITRWND));
  5123. + dst_metric(dst, RTAX_INITRWND), sk);
  5124. ireq->rcv_wscale = rcv_wscale;
  5125. diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
  5126. index 889079b..d7f8b5f 100644
  5127. --- a/net/ipv6/tcp_ipv6.c
  5128. +++ b/net/ipv6/tcp_ipv6.c
  5129. @@ -63,6 +63,8 @@
  5130. #include <net/inet_common.h>
  5131. #include <net/secure_seq.h>
  5132. #include <net/tcp_memcontrol.h>
  5133. +#include <net/mptcp.h>
  5134. +#include <net/mptcp_v6.h>
  5135. #include <net/busy_poll.h>
  5136. #include <asm/uaccess.h>
  5137. @@ -73,14 +75,6 @@
  5138. #include <linux/crypto.h>
  5139. #include <linux/scatterlist.h>
  5140. -static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
  5141. -static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  5142. - struct request_sock *req);
  5143. -
  5144. -static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
  5145. -
  5146. -static const struct inet_connection_sock_af_ops ipv6_mapped;
  5147. -static const struct inet_connection_sock_af_ops ipv6_specific;
  5148. #ifdef CONFIG_TCP_MD5SIG
  5149. static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
  5150. static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
  5151. @@ -92,7 +86,7 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk,
  5152. }
  5153. #endif
  5154. -static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
  5155. +void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
  5156. {
  5157. struct dst_entry *dst = skb_dst(skb);
  5158. const struct rt6_info *rt = (const struct rt6_info *)dst;
  5159. @@ -104,7 +98,7 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
  5160. inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
  5161. }
  5162. -static void tcp_v6_hash(struct sock *sk)
  5163. +void tcp_v6_hash(struct sock *sk)
  5164. {
  5165. if (sk->sk_state != TCP_CLOSE) {
  5166. if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) {
  5167. @@ -117,7 +111,7 @@ static void tcp_v6_hash(struct sock *sk)
  5168. }
  5169. }
  5170. -static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
  5171. +__u32 tcp_v6_init_sequence(const struct sk_buff *skb)
  5172. {
  5173. return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
  5174. ipv6_hdr(skb)->saddr.s6_addr32,
  5175. @@ -125,7 +119,7 @@ static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
  5176. tcp_hdr(skb)->source);
  5177. }
  5178. -static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
  5179. +int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
  5180. int addr_len)
  5181. {
  5182. struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
  5183. @@ -339,7 +333,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
  5184. const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
  5185. const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
  5186. struct ipv6_pinfo *np;
  5187. - struct sock *sk;
  5188. + struct sock *sk, *meta_sk;
  5189. int err;
  5190. struct tcp_sock *tp;
  5191. __u32 seq;
  5192. @@ -359,8 +353,14 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
  5193. return;
  5194. }
  5195. - bh_lock_sock(sk);
  5196. - if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
  5197. + tp = tcp_sk(sk);
  5198. + if (tp->mpc)
  5199. + meta_sk = mptcp_meta_sk(sk);
  5200. + else
  5201. + meta_sk = sk;
  5202. +
  5203. + bh_lock_sock(meta_sk);
  5204. + if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG)
  5205. NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
  5206. if (sk->sk_state == TCP_CLOSE)
  5207. @@ -371,7 +371,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
  5208. goto out;
  5209. }
  5210. - tp = tcp_sk(sk);
  5211. seq = ntohl(th->seq);
  5212. if (sk->sk_state != TCP_LISTEN &&
  5213. !between(seq, tp->snd_una, tp->snd_nxt)) {
  5214. @@ -401,11 +400,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
  5215. goto out;
  5216. tp->mtu_info = ntohl(info);
  5217. - if (!sock_owned_by_user(sk))
  5218. + if (!sock_owned_by_user(meta_sk))
  5219. tcp_v6_mtu_reduced(sk);
  5220. - else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
  5221. + else {
  5222. + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
  5223. &tp->tsq_flags))
  5224. - sock_hold(sk);
  5225. + sock_hold(sk);
  5226. + if (tp->mpc)
  5227. + mptcp_tsq_flags(sk);
  5228. + }
  5229. goto out;
  5230. }
  5231. @@ -415,7 +418,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
  5232. switch (sk->sk_state) {
  5233. struct request_sock *req, **prev;
  5234. case TCP_LISTEN:
  5235. - if (sock_owned_by_user(sk))
  5236. + if (sock_owned_by_user(meta_sk))
  5237. goto out;
  5238. req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr,
  5239. @@ -440,7 +443,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
  5240. case TCP_SYN_SENT:
  5241. case TCP_SYN_RECV: /* Cannot happen.
  5242. It can, it SYNs are crossed. --ANK */
  5243. - if (!sock_owned_by_user(sk)) {
  5244. + if (!sock_owned_by_user(meta_sk)) {
  5245. sk->sk_err = err;
  5246. sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
  5247. @@ -450,22 +453,22 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
  5248. goto out;
  5249. }
  5250. - if (!sock_owned_by_user(sk) && np->recverr) {
  5251. + if (!sock_owned_by_user(meta_sk) && np->recverr) {
  5252. sk->sk_err = err;
  5253. sk->sk_error_report(sk);
  5254. } else
  5255. sk->sk_err_soft = err;
  5256. out:
  5257. - bh_unlock_sock(sk);
  5258. + bh_unlock_sock(meta_sk);
  5259. sock_put(sk);
  5260. }
  5261. -static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
  5262. - struct flowi6 *fl6,
  5263. - struct request_sock *req,
  5264. - u16 queue_mapping)
  5265. +int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
  5266. + struct flowi6 *fl6,
  5267. + struct request_sock *req,
  5268. + u16 queue_mapping)
  5269. {
  5270. struct inet_request_sock *ireq = inet_rsk(req);
  5271. struct ipv6_pinfo *np = inet6_sk(sk);
  5272. @@ -495,7 +498,7 @@ done:
  5273. return err;
  5274. }
  5275. -static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
  5276. +int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
  5277. {
  5278. struct flowi6 fl6;
  5279. int res;
  5280. @@ -506,7 +509,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
  5281. return res;
  5282. }
  5283. -static void tcp_v6_reqsk_destructor(struct request_sock *req)
  5284. +void tcp_v6_reqsk_destructor(struct request_sock *req)
  5285. {
  5286. kfree_skb(inet_rsk(req)->pktopts);
  5287. }
  5288. @@ -719,16 +722,16 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
  5289. };
  5290. #ifdef CONFIG_TCP_MD5SIG
  5291. -static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
  5292. +const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
  5293. .md5_lookup = tcp_v6_reqsk_md5_lookup,
  5294. .calc_md5_hash = tcp_v6_md5_hash_skb,
  5295. };
  5296. #endif
  5297. -static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
  5298. - u32 tsval, u32 tsecr,
  5299. +static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack,
  5300. + u32 data_ack, u32 win, u32 tsval, u32 tsecr,
  5301. struct tcp_md5sig_key *key, int rst, u8 tclass,
  5302. - u32 label)
  5303. + u32 label, int mptcp)
  5304. {
  5305. const struct tcphdr *th = tcp_hdr(skb);
  5306. struct tcphdr *t1;
  5307. @@ -746,7 +749,10 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
  5308. if (key)
  5309. tot_len += TCPOLEN_MD5SIG_ALIGNED;
  5310. #endif
  5311. -
  5312. +#ifdef CONFIG_MPTCP
  5313. + if (mptcp)
  5314. + tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
  5315. +#endif
  5316. buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
  5317. GFP_ATOMIC);
  5318. if (buff == NULL)
  5319. @@ -784,6 +790,17 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
  5320. tcp_v6_md5_hash_hdr((__u8 *)topt, key,
  5321. &ipv6_hdr(skb)->saddr,
  5322. &ipv6_hdr(skb)->daddr, t1);
  5323. + topt += 4;
  5324. + }
  5325. +#endif
  5326. +#ifdef CONFIG_MPTCP
  5327. + if (mptcp) {
  5328. + /* Construction of 32-bit data_ack */
  5329. + *topt++ = htonl((TCPOPT_MPTCP << 24) |
  5330. + ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
  5331. + (0x20 << 8) |
  5332. + (0x01));
  5333. + *topt++ = htonl(data_ack);
  5334. }
  5335. #endif
  5336. @@ -821,7 +838,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
  5337. kfree_skb(buff);
  5338. }
  5339. -static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
  5340. +void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
  5341. {
  5342. const struct tcphdr *th = tcp_hdr(skb);
  5343. u32 seq = 0, ack_seq = 0;
  5344. @@ -876,7 +893,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
  5345. ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len -
  5346. (th->doff << 2);
  5347. - tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, key, 1, 0, 0);
  5348. + tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, 0, key, 1, 0, 0, 0);
  5349. #ifdef CONFIG_TCP_MD5SIG
  5350. release_sk1:
  5351. @@ -887,40 +904,47 @@ release_sk1:
  5352. #endif
  5353. }
  5354. -static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
  5355. +static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
  5356. u32 win, u32 tsval, u32 tsecr,
  5357. - struct tcp_md5sig_key *key, u8 tclass,
  5358. - u32 label)
  5359. + struct tcp_md5sig_key *key, u8 tclass, u32 label,
  5360. + int mptcp)
  5361. {
  5362. - tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, key, 0, tclass,
  5363. - label);
  5364. + tcp_v6_send_response(skb, seq, ack, data_ack, win, tsval, tsecr, key, 0,
  5365. + tclass, label, mptcp);
  5366. }
  5367. static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
  5368. {
  5369. struct inet_timewait_sock *tw = inet_twsk(sk);
  5370. struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
  5371. + u32 data_ack = 0;
  5372. + int mptcp = 0;
  5373. + if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
  5374. + data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
  5375. + mptcp = 1;
  5376. + }
  5377. tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
  5378. + data_ack,
  5379. tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
  5380. tcp_time_stamp + tcptw->tw_ts_offset,
  5381. tcptw->tw_ts_recent, tcp_twsk_md5_key(tcptw),
  5382. - tw->tw_tclass, (tw->tw_flowlabel << 12));
  5383. + tw->tw_tclass, (tw->tw_flowlabel << 12), mptcp);
  5384. inet_twsk_put(tw);
  5385. }
  5386. -static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  5387. - struct request_sock *req)
  5388. +void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  5389. + struct request_sock *req)
  5390. {
  5391. tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1,
  5392. - req->rcv_wnd, tcp_time_stamp, req->ts_recent,
  5393. + 0, req->rcv_wnd, tcp_time_stamp, req->ts_recent,
  5394. tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
  5395. - 0, 0);
  5396. + 0, 0, 0);
  5397. }
  5398. -static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
  5399. +struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
  5400. {
  5401. struct request_sock *req, **prev;
  5402. const struct tcphdr *th = tcp_hdr(skb);
  5403. @@ -939,7 +963,13 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
  5404. if (nsk) {
  5405. if (nsk->sk_state != TCP_TIME_WAIT) {
  5406. + /* Don't lock the meta-sk again: it has already been
  5407. + * locked before mptcp_v6_do_rcv.
  5408. + */
  5409. + if (tcp_sk(nsk)->mpc && !is_meta_sk(sk))
  5410. + bh_lock_sock(mptcp_meta_sk(nsk));
  5411. bh_lock_sock(nsk);
  5412. +
  5413. return nsk;
  5414. }
  5415. inet_twsk_put(inet_twsk(nsk));
  5416. @@ -959,6 +989,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
  5417. static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
  5418. {
  5419. struct tcp_options_received tmp_opt;
  5420. + struct mptcp_options_received mopt;
  5421. struct request_sock *req;
  5422. struct inet_request_sock *ireq;
  5423. struct ipv6_pinfo *np = inet6_sk(sk);
  5424. @@ -971,6 +1002,23 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
  5425. if (skb->protocol == htons(ETH_P_IP))
  5426. return tcp_v4_conn_request(sk, skb);
  5427. + tcp_clear_options(&tmp_opt);
  5428. + tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
  5429. + tmp_opt.user_mss = tp->rx_opt.user_mss;
  5430. + mptcp_init_mp_opt(&mopt);
  5431. + tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
  5432. +
  5433. +#ifdef CONFIG_MPTCP
  5434. + /* MPTCP structures not initialized: re-initialize mopt (no error is returned here) */
  5435. + if (mptcp_init_failed)
  5436. + mptcp_init_mp_opt(&mopt);
  5437. +
  5438. + if (mopt.is_mp_join)
  5439. + return mptcp_do_join_short(skb, &mopt, &tmp_opt, sock_net(sk));
  5440. + if (mopt.drop_me)
  5441. + goto drop;
  5442. +#endif
  5443. +
  5444. if (!ipv6_unicast_destination(skb))
  5445. goto drop;
  5446. @@ -986,7 +1034,22 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
  5447. goto drop;
  5448. }
  5449. - req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
  5450. +#ifdef CONFIG_MPTCP
  5451. + if (sysctl_mptcp_enabled == MPTCP_APP && !tp->mptcp_enabled)
  5452. + mopt.saw_mpc = 0;
  5453. + if (mopt.saw_mpc && !want_cookie) {
  5454. + req = inet6_reqsk_alloc(&mptcp6_request_sock_ops);
  5455. +
  5456. + if (req == NULL)
  5457. + goto drop;
  5458. +
  5459. + mptcp_rsk(req)->mpcb = NULL;
  5460. + mptcp_rsk(req)->dss_csum = mopt.dss_csum;
  5461. + mptcp_rsk(req)->collide_tk.pprev = NULL;
  5462. + } else
  5463. +#endif
  5464. + req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
  5465. +
  5466. if (req == NULL)
  5467. goto drop;
  5468. @@ -994,17 +1057,15 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
  5469. tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
  5470. #endif
  5471. - tcp_clear_options(&tmp_opt);
  5472. - tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
  5473. - tmp_opt.user_mss = tp->rx_opt.user_mss;
  5474. - tcp_parse_options(skb, &tmp_opt, 0, NULL);
  5475. -
  5476. if (want_cookie && !tmp_opt.saw_tstamp)
  5477. tcp_clear_options(&tmp_opt);
  5478. tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
  5479. tcp_openreq_init(req, &tmp_opt, skb);
  5480. + if (mopt.saw_mpc && !want_cookie)
  5481. + mptcp_reqsk_new_mptcp(req, &tmp_opt, &mopt, skb);
  5482. +
  5483. ireq = inet_rsk(req);
  5484. ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
  5485. ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
  5486. @@ -1094,9 +1155,9 @@ drop:
  5487. return 0; /* don't send reset */
  5488. }
  5489. -static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
  5490. - struct request_sock *req,
  5491. - struct dst_entry *dst)
  5492. +struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
  5493. + struct request_sock *req,
  5494. + struct dst_entry *dst)
  5495. {
  5496. struct inet_request_sock *ireq;
  5497. struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
  5498. @@ -1317,7 +1378,7 @@ static __sum16 tcp_v6_checksum_init(struct sk_buff *skb)
  5499. * This is because we cannot sleep with the original spinlock
  5500. * held.
  5501. */
  5502. -static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
  5503. +int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
  5504. {
  5505. struct ipv6_pinfo *np = inet6_sk(sk);
  5506. struct tcp_sock *tp;
  5507. @@ -1339,6 +1400,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
  5508. goto discard;
  5509. #endif
  5510. + if (is_meta_sk(sk))
  5511. + return mptcp_v6_do_rcv(sk, skb);
  5512. +
  5513. if (sk_filter(sk, skb))
  5514. goto discard;
  5515. @@ -1460,7 +1524,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
  5516. {
  5517. const struct tcphdr *th;
  5518. const struct ipv6hdr *hdr;
  5519. - struct sock *sk;
  5520. + struct sock *sk, *meta_sk = NULL;
  5521. int ret;
  5522. struct net *net = dev_net(skb->dev);
  5523. @@ -1491,18 +1555,43 @@ static int tcp_v6_rcv(struct sk_buff *skb)
  5524. TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
  5525. skb->len - th->doff*4);
  5526. TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
  5527. +#ifdef CONFIG_MPTCP
  5528. + TCP_SKB_CB(skb)->mptcp_flags = 0;
  5529. + TCP_SKB_CB(skb)->dss_off = 0;
  5530. +#endif
  5531. TCP_SKB_CB(skb)->when = 0;
  5532. TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
  5533. TCP_SKB_CB(skb)->sacked = 0;
  5534. sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
  5535. - if (!sk)
  5536. - goto no_tcp_socket;
  5537. process:
  5538. - if (sk->sk_state == TCP_TIME_WAIT)
  5539. + if (sk && sk->sk_state == TCP_TIME_WAIT)
  5540. goto do_time_wait;
  5541. +#ifdef CONFIG_MPTCP
  5542. + if (!sk && th->syn && !th->ack) {
  5543. + int ret = mptcp_lookup_join(skb, NULL);
  5544. +
  5545. + if (ret < 0) {
  5546. + tcp_v6_send_reset(NULL, skb);
  5547. + goto discard_it;
  5548. + } else if (ret > 0) {
  5549. + return 0;
  5550. + }
  5551. + }
  5552. +
  5553. + /* Is there a pending request sock for this segment ? */
  5554. + if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
  5555. + if (sk)
  5556. + sock_put(sk);
  5557. + return 0;
  5558. + }
  5559. +#endif
  5560. +
  5561. + if (!sk)
  5562. + goto no_tcp_socket;
  5563. +
  5564. if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
  5565. NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
  5566. goto discard_and_relse;
  5567. @@ -1517,11 +1606,21 @@ process:
  5568. sk_mark_napi_id(sk, skb);
  5569. skb->dev = NULL;
  5570. - bh_lock_sock_nested(sk);
  5571. + if (tcp_sk(sk)->mpc) {
  5572. + meta_sk = mptcp_meta_sk(sk);
  5573. +
  5574. + bh_lock_sock_nested(meta_sk);
  5575. + if (sock_owned_by_user(meta_sk))
  5576. + skb->sk = sk;
  5577. + } else {
  5578. + meta_sk = sk;
  5579. + bh_lock_sock_nested(sk);
  5580. + }
  5581. +
  5582. ret = 0;
  5583. - if (!sock_owned_by_user(sk)) {
  5584. + if (!sock_owned_by_user(meta_sk)) {
  5585. #ifdef CONFIG_NET_DMA
  5586. - struct tcp_sock *tp = tcp_sk(sk);
  5587. + struct tcp_sock *tp = tcp_sk(meta_sk);
  5588. if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
  5589. tp->ucopy.dma_chan = net_dma_find_channel();
  5590. if (tp->ucopy.dma_chan)
  5591. @@ -1529,16 +1628,17 @@ process:
  5592. else
  5593. #endif
  5594. {
  5595. - if (!tcp_prequeue(sk, skb))
  5596. + if (!tcp_prequeue(meta_sk, skb))
  5597. ret = tcp_v6_do_rcv(sk, skb);
  5598. }
  5599. - } else if (unlikely(sk_add_backlog(sk, skb,
  5600. - sk->sk_rcvbuf + sk->sk_sndbuf))) {
  5601. - bh_unlock_sock(sk);
  5602. + } else if (unlikely(sk_add_backlog(meta_sk, skb,
  5603. + meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
  5604. + bh_unlock_sock(meta_sk);
  5605. NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
  5606. goto discard_and_relse;
  5607. }
  5608. - bh_unlock_sock(sk);
  5609. +
  5610. + bh_unlock_sock(meta_sk);
  5611. sock_put(sk);
  5612. return ret ? -1 : 0;
  5613. @@ -1595,6 +1695,18 @@ do_time_wait:
  5614. sk = sk2;
  5615. goto process;
  5616. }
  5617. +#ifdef CONFIG_MPTCP
  5618. + if (th->syn && !th->ack) {
  5619. + int ret = mptcp_lookup_join(skb, inet_twsk(sk));
  5620. +
  5621. + if (ret < 0) {
  5622. + tcp_v6_send_reset(NULL, skb);
  5623. + goto discard_it;
  5624. + } else if (ret > 0) {
  5625. + return 0;
  5626. + }
  5627. + }
  5628. +#endif
  5629. /* Fall through to ACK */
  5630. }
  5631. case TCP_TW_ACK:
  5632. @@ -1644,13 +1756,13 @@ static void tcp_v6_early_demux(struct sk_buff *skb)
  5633. }
  5634. }
  5635. -static struct timewait_sock_ops tcp6_timewait_sock_ops = {
  5636. +struct timewait_sock_ops tcp6_timewait_sock_ops = {
  5637. .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
  5638. .twsk_unique = tcp_twsk_unique,
  5639. .twsk_destructor= tcp_twsk_destructor,
  5640. };
  5641. -static const struct inet_connection_sock_af_ops ipv6_specific = {
  5642. +const struct inet_connection_sock_af_ops ipv6_specific = {
  5643. .queue_xmit = inet6_csk_xmit,
  5644. .send_check = tcp_v6_send_check,
  5645. .rebuild_header = inet6_sk_rebuild_header,
  5646. @@ -1682,7 +1794,7 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
  5647. * TCP over IPv4 via INET6 API
  5648. */
  5649. -static const struct inet_connection_sock_af_ops ipv6_mapped = {
  5650. +const struct inet_connection_sock_af_ops ipv6_mapped = {
  5651. .queue_xmit = ip_queue_xmit,
  5652. .send_check = tcp_v4_send_check,
  5653. .rebuild_header = inet_sk_rebuild_header,
  5654. @@ -1727,7 +1839,7 @@ static int tcp_v6_init_sock(struct sock *sk)
  5655. return 0;
  5656. }
  5657. -static void tcp_v6_destroy_sock(struct sock *sk)
  5658. +void tcp_v6_destroy_sock(struct sock *sk)
  5659. {
  5660. tcp_v4_destroy_sock(sk);
  5661. inet6_destroy_sock(sk);
  5662. diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
  5663. new file mode 100644
  5664. index 0000000..88a05b1
  5665. --- /dev/null
  5666. +++ b/net/mptcp/Kconfig
  5667. @@ -0,0 +1,58 @@
  5668. +#
  5669. +# MPTCP configuration
  5670. +#
  5671. +config MPTCP
  5672. + bool "MPTCP protocol"
  5673. + depends on (IPV6=y || IPV6=n)
  5674. + ---help---
  5675. + This replaces the normal TCP stack with a Multipath TCP stack,
  5676. + able to use several paths at once.
  5677. +
  5678. +menuconfig MPTCP_PM_ADVANCED
  5679. + bool "MPTCP: advanced path-manager control"
  5680. + depends on MPTCP=y
  5681. + ---help---
  5682. + Support for selection of different path-managers. You should choose 'Y' here,
  5683. + because otherwise you will not actively create new MPTCP-subflows.
  5684. +
  5685. +if MPTCP_PM_ADVANCED
  5686. +
  5687. +config MPTCP_FULLMESH
  5688. + tristate "MPTCP Full-Mesh Path-Manager"
  5689. + depends on MPTCP=y
  5690. + ---help---
  5691. + This path-management module will create a full-mesh among all IP-addresses.
  5692. +
  5693. +config MPTCP_NDIFFPORTS
  5694. + tristate "MPTCP ndiff-ports"
  5695. + depends on MPTCP=y
  5696. + ---help---
  5697. + This path-management module will create multiple subflows between the same
  5698. + pair of IP-addresses, modifying the source-port. You can set the number
  5699. + of subflows via the mptcp_ndiffports-sysctl.
  5700. +
  5701. +choice
  5702. + prompt "Default MPTCP Path-Manager"
  5703. + default DEFAULT_DUMMY
  5704. + help
  5705. + Select the Path-Manager of your choice
  5706. +
  5707. + config DEFAULT_FULLMESH
  5708. + bool "Full mesh" if MPTCP_FULLMESH=y
  5709. +
  5710. + config DEFAULT_NDIFFPORTS
  5711. + bool "ndiff-ports" if MPTCP_NDIFFPORTS=y
  5712. +
  5713. + config DEFAULT_DUMMY
  5714. + bool "Default"
  5715. +
  5716. +endchoice
  5717. +
  5718. +endif
  5719. +
  5720. +config DEFAULT_MPTCP_PM
  5721. + string
  5722. + default "default" if DEFAULT_DUMMY
  5723. + default "fullmesh" if DEFAULT_FULLMESH
  5724. + default "ndiffports" if DEFAULT_NDIFFPORTS
  5725. + default "default"
  5726. diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
  5727. new file mode 100644
  5728. index 0000000..e7238f5
  5729. --- /dev/null
  5730. +++ b/net/mptcp/Makefile
  5731. @@ -0,0 +1,18 @@
  5732. +#
  5733. +## Makefile for MultiPath TCP support code.
  5734. +#
  5735. +#
  5736. +
  5737. +obj-$(CONFIG_MPTCP) += mptcp.o
  5738. +
  5739. +mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_ofo_queue.o mptcp_pm.o \
  5740. + mptcp_output.o mptcp_input.o
  5741. +
  5742. +obj-$(CONFIG_TCP_CONG_COUPLED) += mptcp_coupled.o
  5743. +obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o
  5744. +obj-$(CONFIG_TCP_CONG_WVEGAS) += mptcp_wvegas.o
  5745. +obj-$(CONFIG_MPTCP_FULLMESH) += mptcp_fullmesh.o
  5746. +obj-$(CONFIG_MPTCP_NDIFFPORTS) += mptcp_ndiffports.o
  5747. +
  5748. +mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o
  5749. +
  5750. diff --git a/net/mptcp/mptcp_coupled.c b/net/mptcp/mptcp_coupled.c
  5751. new file mode 100644
  5752. index 0000000..d71f96e
  5753. --- /dev/null
  5754. +++ b/net/mptcp/mptcp_coupled.c
  5755. @@ -0,0 +1,273 @@
  5756. +/*
  5757. + * MPTCP implementation - Coupled Congestion Control
  5758. + *
  5759. + * Initial Design & Implementation:
  5760. + * Sébastien Barré <sebastien.barre@uclouvain.be>
  5761. + *
  5762. + * Current Maintainer & Author:
  5763. + * Christoph Paasch <christoph.paasch@uclouvain.be>
  5764. + *
  5765. + * Additional authors:
  5766. + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
  5767. + * Gregory Detal <gregory.detal@uclouvain.be>
  5768. + * Fabien Duchêne <fabien.duchene@uclouvain.be>
  5769. + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
  5770. + * Lavkesh Lahngir <lavkesh51@gmail.com>
  5771. + * Andreas Ripke <ripke@neclab.eu>
  5772. + * Vlad Dogaru <vlad.dogaru@intel.com>
  5773. + * Octavian Purdila <octavian.purdila@intel.com>
  5774. + * John Ronan <jronan@tssg.org>
  5775. + * Catalin Nicutar <catalin.nicutar@gmail.com>
  5776. + * Brandon Heller <brandonh@stanford.edu>
  5777. + *
  5778. + *
  5779. + * This program is free software; you can redistribute it and/or
  5780. + * modify it under the terms of the GNU General Public License
  5781. + * as published by the Free Software Foundation; either version
  5782. + * 2 of the License, or (at your option) any later version.
  5783. + */
  5784. +#include <net/tcp.h>
  5785. +#include <net/mptcp.h>
  5786. +
  5787. +#include <linux/module.h>
  5788. +
  5789. +/* Fixed-point scaling of alpha; all three values are left-shift amounts.
  5790. + * The numerator is shifted by alpha_scale_num, each denominator term by
  5791. + * alpha_scale_den (the summed denominator is then squared).
  5792. + *
  5793. + * To downscale, we just need to use alpha_scale.
  5794. + * We have: alpha_scale = alpha_scale_num - 2 * alpha_scale_den (12 = 32 - 2 * 10)
  5795. + */
  5796. +static int alpha_scale_den = 10;
  5797. +static int alpha_scale_num = 32;
  5798. +static int alpha_scale = 12;
  5799. +/* Coupled-CC state, read and written through inet_csk_ca(meta_sk). */
  5800. +struct mptcp_ccc {
  5801. + u64 alpha; /* scaled alpha; 1 after init, updated by mptcp_ccc_recalc_alpha() */
  5802. + bool forced_update; /* set by mptcp_ccc_set_state(); forces an alpha recalc */
  5803. +};
  5804. +/* A subflow counts for alpha only if it can send and srtt != 0 (srtt is a divisor). */
  5805. +static inline int mptcp_ccc_sk_can_send(const struct sock *sk)
  5806. +{
  5807. + return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt;
  5808. +}
  5809. +/* Read alpha from the struct mptcp_ccc returned by inet_csk_ca(meta_sk). */
  5810. +static inline u64 mptcp_get_alpha(struct sock *meta_sk)
  5811. +{
  5812. + struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk);
  5813. + return mptcp_ccc->alpha;
  5814. +}
  5815. +/* Store alpha in the struct mptcp_ccc returned by inet_csk_ca(meta_sk). */
  5816. +static inline void mptcp_set_alpha(struct sock *meta_sk, u64 alpha)
  5817. +{
  5818. + struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk);
  5819. + mptcp_ccc->alpha = alpha;
  5820. +}
  5821. +/* Widen val to u64 and shift it left by scale bits. */
  5822. +static inline u64 mptcp_ccc_scale(u32 val, int scale)
  5823. +{
  5824. + return (u64) val << scale;
  5825. +}
  5826. +/* Read the forced_update flag stored on meta_sk. */
  5827. +static inline bool mptcp_get_forced(struct sock *meta_sk)
  5828. +{
  5829. + struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk);
  5830. + return mptcp_ccc->forced_update;
  5831. +}
  5832. +/* Set or clear the forced_update flag stored on meta_sk. */
  5833. +static inline void mptcp_set_forced(struct sock *meta_sk, bool force)
  5834. +{
  5835. + struct mptcp_ccc *mptcp_ccc = inet_csk_ca(meta_sk);
  5836. + mptcp_ccc->forced_update = force;
  5837. +}
  5838. +/* Recompute alpha from all sendable subflows of sk's mpcb and store it on the meta-sk. */
  5839. +static void mptcp_ccc_recalc_alpha(struct sock *sk)
  5840. +{
  5841. + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
  5842. + struct sock *sub_sk;
  5843. + int best_cwnd = 0, best_rtt = 0, can_send = 0;
  5844. + u64 max_numerator = 0, sum_denominator = 0, alpha = 1;
  5845. +
  5846. + if (!mpcb)
  5847. + return;
  5848. +
  5849. + /* Only one subflow left - fall back to normal reno-behavior
  5850. + * (set alpha to 1) */
  5851. + if (mpcb->cnt_established <= 1)
  5852. + goto exit;
  5853. +
  5854. + /* Do regular alpha-calculation for multiple subflows */
  5855. +
  5856. + /* Find the max numerator of the alpha-calculation */
  5857. + mptcp_for_each_sk(mpcb, sub_sk) {
  5858. + struct tcp_sock *sub_tp = tcp_sk(sub_sk);
  5859. + u64 tmp;
  5860. +
  5861. + if (!mptcp_ccc_sk_can_send(sub_sk))
  5862. + continue;
  5863. +
  5864. + can_send++;
  5865. +
  5866. + /* We need to look for the path, that provides the max-value.
  5867. + * Integer-overflow is not possible here, because
  5868. + * tmp will be in u64.
  5869. + */
  5870. + tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd,
  5871. + alpha_scale_num), (u64)sub_tp->srtt * sub_tp->srtt);
  5872. +
  5873. + if (tmp >= max_numerator) {
  5874. + max_numerator = tmp;
  5875. + best_cwnd = sub_tp->snd_cwnd;
  5876. + best_rtt = sub_tp->srtt;
  5877. + }
  5878. + }
  5879. +
  5880. + /* No subflow is able to send - we don't care anymore */
  5881. + if (unlikely(!can_send))
  5882. + goto exit;
  5883. +
  5884. + /* Calculate the denominator */
  5885. + mptcp_for_each_sk(mpcb, sub_sk) {
  5886. + struct tcp_sock *sub_tp = tcp_sk(sub_sk);
  5887. +
  5888. + if (!mptcp_ccc_sk_can_send(sub_sk))
  5889. + continue;
  5890. +
  5891. + sum_denominator += div_u64(
  5892. + mptcp_ccc_scale(sub_tp->snd_cwnd,
  5893. + alpha_scale_den) * best_rtt,
  5894. + sub_tp->srtt);
  5895. + }
  5896. + sum_denominator *= sum_denominator;
  5897. + if (unlikely(!sum_denominator)) {
  5898. + pr_err("%s: sum_denominator == 0, cnt_established:%d\n",
  5899. + __func__, mpcb->cnt_established);
  5900. + mptcp_for_each_sk(mpcb, sub_sk) {
  5901. + struct tcp_sock *sub_tp = tcp_sk(sub_sk);
  5902. + pr_err("%s: pi:%d, state:%d, rtt:%u, cwnd: %u\n",
  5903. + __func__, sub_tp->mptcp->path_index, sub_sk->sk_state,
  5904. + sub_tp->srtt, sub_tp->snd_cwnd);
  5905. + }
  5906. + /* Never divide by zero below: keep the initial alpha of 1. */
  5907. + goto exit;
  5908. + }
  5909. + alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator);
  5910. +
  5911. + if (unlikely(!alpha))
  5912. + alpha = 1;
  5913. +
  5914. +exit:
  5915. + mptcp_set_alpha(mptcp_meta_sk(sk), alpha);
  5916. +}
  5917. +/* .init hook: for MPTCP sockets, clear the forced-update flag and set alpha to 1. */
  5918. +static void mptcp_ccc_init(struct sock *sk)
  5919. +{
  5920. + if (tcp_sk(sk)->mpc) {
  5921. + mptcp_set_forced(mptcp_meta_sk(sk), 0);
  5922. + mptcp_set_alpha(mptcp_meta_sk(sk), 1);
  5923. + }
  5924. + /* If this is not an MPTCP socket, behave like reno: nothing to set up */
  5925. +}
  5926. +/* .cwnd_event hook: recompute alpha on CA_EVENT_LOSS; other events are ignored. */
  5927. +static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event)
  5928. +{
  5929. + if (event == CA_EVENT_LOSS)
  5930. + mptcp_ccc_recalc_alpha(sk);
  5931. +}
  5932. +/* .set_state hook: for MPTCP sockets, request an alpha recalc (ca_state is not inspected). */
  5933. +static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state)
  5934. +{
  5935. + if (!tcp_sk(sk)->mpc)
  5936. + return;
  5937. +
  5938. + mptcp_set_forced(mptcp_meta_sk(sk), 1); /* consumed in mptcp_ccc_cong_avoid() */
  5939. +}
  5940. +/* .cong_avoid hook: reno for non-MPTCP sockets, otherwise alpha-coupled cwnd increase. */
  5941. +static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
  5942. +{
  5943. + struct tcp_sock *tp = tcp_sk(sk);
  5944. + struct mptcp_cb *mpcb = tp->mpcb;
  5945. + int snd_cwnd;
  5946. +
  5947. + if (!tp->mpc) {
  5948. + tcp_reno_cong_avoid(sk, ack, acked, in_flight);
  5949. + return;
  5950. + }
  5951. +
  5952. + if (!tcp_is_cwnd_limited(sk, in_flight))
  5953. + return;
  5954. +
  5955. + if (tp->snd_cwnd <= tp->snd_ssthresh) {
  5956. + /* In "safe" area, increase. */
  5957. + tcp_slow_start(tp, acked);
  5958. + mptcp_ccc_recalc_alpha(sk);
  5959. + return;
  5960. + }
  5961. +
  5962. + if (mptcp_get_forced(mptcp_meta_sk(sk))) {
  5963. + mptcp_ccc_recalc_alpha(sk);
  5964. + mptcp_set_forced(mptcp_meta_sk(sk), 0);
  5965. + }
  5966. +
  5967. + if (mpcb->cnt_established > 1) {
  5968. + u64 alpha = mptcp_get_alpha(mptcp_meta_sk(sk));
  5969. +
  5970. + /* This may happen, if at the initialization, the mpcb
  5971. + * was not yet attached to the sock, and thus
  5972. + * initializing alpha failed.
  5973. + */
  5974. + if (unlikely(!alpha))
  5975. + alpha = 1;
  5976. +
  5977. + /* alpha is u64: div_u64() would truncate the divisor to u32 */
  5978. + snd_cwnd = (int) div64_u64(mptcp_ccc_scale(1, alpha_scale), alpha);
  5979. +
  5980. + /* snd_cwnd_cnt >= max (scale * tot_cwnd / alpha, cwnd)
  5981. + * Thus, we select here the max value. */
  5982. + if (snd_cwnd < tp->snd_cwnd)
  5983. + snd_cwnd = tp->snd_cwnd;
  5984. + } else {
  5985. + snd_cwnd = tp->snd_cwnd;
  5986. + }
  5987. +
  5988. + if (tp->snd_cwnd_cnt >= snd_cwnd) {
  5989. + if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
  5990. + tp->snd_cwnd++;
  5991. + mptcp_ccc_recalc_alpha(sk);
  5992. + }
  5993. +
  5994. + tp->snd_cwnd_cnt = 0;
  5995. + } else {
  5996. + tp->snd_cwnd_cnt++;
  5997. + }
  5998. +}
  5999. +/* Congestion-control ops registered under the name "coupled"; ssthresh/min_cwnd reuse reno. */
  6000. +static struct tcp_congestion_ops mptcp_ccc = {
  6001. + .init = mptcp_ccc_init,
  6002. + .ssthresh = tcp_reno_ssthresh,
  6003. + .cong_avoid = mptcp_ccc_cong_avoid,
  6004. + .cwnd_event = mptcp_ccc_cwnd_event,
  6005. + .set_state = mptcp_ccc_set_state,
  6006. + .min_cwnd = tcp_reno_min_cwnd,
  6007. + .owner = THIS_MODULE,
  6008. + .name = "coupled",
  6009. +};
  6010. +/* Module init: build-time check that struct mptcp_ccc fits in ICSK_CA_PRIV_SIZE, then register. */
  6011. +static int __init mptcp_ccc_register(void)
  6012. +{
  6013. + BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE);
  6014. + return tcp_register_congestion_control(&mptcp_ccc);
  6015. +}
  6016. +/* Module exit: unregister the "coupled" congestion-control ops. */
  6017. +static void __exit mptcp_ccc_unregister(void)
  6018. +{
  6019. + tcp_unregister_congestion_control(&mptcp_ccc);
  6020. +}
  6021. +
  6022. +module_init(mptcp_ccc_register);
  6023. +module_exit(mptcp_ccc_unregister);
  6024. +
  6025. +MODULE_AUTHOR("Christoph Paasch, Sébastien Barré");
  6026. +MODULE_LICENSE("GPL");
  6027. +MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL");
  6028. +MODULE_VERSION("0.1");
  6029. diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c
  6030. new file mode 100644
  6031. index 0000000..6a7654d
  6032. --- /dev/null
  6033. +++ b/net/mptcp/mptcp_ctrl.c
  6034. @@ -0,0 +1,2270 @@
  6035. +/*
  6036. + * MPTCP implementation - MPTCP-control
  6037. + *
  6038. + * Initial Design & Implementation:
  6039. + * Sébastien Barré <sebastien.barre@uclouvain.be>
  6040. + *
  6041. + * Current Maintainer & Author:
  6042. + * Christoph Paasch <christoph.paasch@uclouvain.be>
  6043. + *
  6044. + * Additional authors:
  6045. + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
  6046. + * Gregory Detal <gregory.detal@uclouvain.be>
  6047. + * Fabien Duchêne <fabien.duchene@uclouvain.be>
  6048. + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
  6049. + * Lavkesh Lahngir <lavkesh51@gmail.com>
  6050. + * Andreas Ripke <ripke@neclab.eu>
  6051. + * Vlad Dogaru <vlad.dogaru@intel.com>
  6052. + * Octavian Purdila <octavian.purdila@intel.com>
  6053. + * John Ronan <jronan@tssg.org>
  6054. + * Catalin Nicutar <catalin.nicutar@gmail.com>
  6055. + * Brandon Heller <brandonh@stanford.edu>
  6056. + *
  6057. + *
  6058. + * This program is free software; you can redistribute it and/or
  6059. + * modify it under the terms of the GNU General Public License
  6060. + * as published by the Free Software Foundation; either version
  6061. + * 2 of the License, or (at your option) any later version.
  6062. + */
  6063. +
  6064. +#include <net/inet_common.h>
  6065. +#include <net/inet6_hashtables.h>
  6066. +#include <net/ipv6.h>
  6067. +#include <net/ip6_checksum.h>
  6068. +#include <net/mptcp.h>
  6069. +#include <net/mptcp_v4.h>
  6070. +#if IS_ENABLED(CONFIG_IPV6)
  6071. +#include <net/mptcp_v6.h>
  6072. +#endif
  6073. +#include <net/sock.h>
  6074. +#include <net/tcp.h>
  6075. +#include <net/tcp_states.h>
  6076. +#include <net/transp_v6.h>
  6077. +#include <net/xfrm.h>
  6078. +
  6079. +#include <linux/cryptohash.h>
  6080. +#include <linux/kconfig.h>
  6081. +#include <linux/module.h>
  6082. +#include <linux/netpoll.h>
  6083. +#include <linux/list.h>
  6084. +#include <linux/jhash.h>
  6085. +#include <linux/tcp.h>
  6086. +#include <linux/net.h>
  6087. +#include <linux/in.h>
  6088. +#include <linux/random.h>
  6089. +#include <linux/inetdevice.h>
  6090. +#include <linux/workqueue.h>
  6091. +#include <linux/atomic.h>
  6092. +#include <linux/sysctl.h>
  6093. +
  6094. +static struct kmem_cache *mptcp_sock_cache __read_mostly;
  6095. +static struct kmem_cache *mptcp_cb_cache __read_mostly;
  6096. +static struct kmem_cache *mptcp_tw_cache __read_mostly;
  6097. +/* Tunables exposed through the mptcp_table sysctl entries */
  6098. +int sysctl_mptcp_enabled __read_mostly = 1;
  6099. +int sysctl_mptcp_checksum __read_mostly = 1;
  6100. +int sysctl_mptcp_debug __read_mostly;
  6101. +EXPORT_SYMBOL(sysctl_mptcp_debug);
  6102. +int sysctl_mptcp_syn_retries __read_mostly = 3;
  6103. +
  6104. +bool mptcp_init_failed __read_mostly;
  6105. +/* sysctl handler for mptcp_path_manager: read or set the default path-manager name. */
  6106. +static int proc_mptcp_path_manager(ctl_table *ctl, int write,
  6107. + void __user *buffer, size_t *lenp,
  6108. + loff_t *ppos)
  6109. +{
  6110. + char val[MPTCP_PM_NAME_MAX];
  6111. + ctl_table tbl = {
  6112. + .data = val,
  6113. + .maxlen = MPTCP_PM_NAME_MAX,
  6114. + };
  6115. + int ret;
  6116. + /* Presumably fills val with the current default name, so a read reports it. */
  6117. + mptcp_get_default_path_manager(val);
  6118. + /* proc_dostring() works on the temporary table, i.e. on val, not on ctl. */
  6119. + ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
  6120. + if (write && ret == 0)
  6121. + ret = mptcp_set_default_path_manager(val);
  6122. + return ret;
  6123. +}
  6124. +/* sysctl entries for MPTCP; the int tunables all use proc_dointvec. */
  6125. +static struct ctl_table mptcp_table[] = {
  6126. + {
  6127. + .procname = "mptcp_enabled",
  6128. + .data = &sysctl_mptcp_enabled,
  6129. + .maxlen = sizeof(int),
  6130. + .mode = 0644,
  6131. + .proc_handler = &proc_dointvec
  6132. + },
  6133. + {
  6134. + .procname = "mptcp_checksum",
  6135. + .data = &sysctl_mptcp_checksum,
  6136. + .maxlen = sizeof(int),
  6137. + .mode = 0644,
  6138. + .proc_handler = &proc_dointvec
  6139. + },
  6140. + {
  6141. + .procname = "mptcp_debug",
  6142. + .data = &sysctl_mptcp_debug,
  6143. + .maxlen = sizeof(int),
  6144. + .mode = 0644,
  6145. + .proc_handler = &proc_dointvec
  6146. + },
  6147. + {
  6148. + .procname = "mptcp_syn_retries",
  6149. + .data = &sysctl_mptcp_syn_retries,
  6150. + .maxlen = sizeof(int),
  6151. + .mode = 0644,
  6152. + .proc_handler = &proc_dointvec
  6153. + },
  6154. + {
  6155. + .procname = "mptcp_path_manager",
  6156. + .mode = 0644,
  6157. + .maxlen = MPTCP_PM_NAME_MAX,
  6158. + .proc_handler = proc_mptcp_path_manager, /* string value: custom handler, no .data */
  6159. + },
  6160. + { } /* empty terminating entry */
  6161. +};
  6162. +/* Map a token to a bucket index in [0, MPTCP_HASH_SIZE). */
  6163. +static inline u32 mptcp_hash_tk(u32 token)
  6164. +{
  6165. + return token % MPTCP_HASH_SIZE;
  6166. +}
  6167. +
  6168. +struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE]; /* tcp_socks chained through tk_table, bucketed by mptcp_hash_tk(token) */
  6169. +EXPORT_SYMBOL(tk_hashtable);
  6170. +
  6171. +/* This second hashtable is needed to retrieve request socks
  6172. + * created as a result of a join request. While the SYN contains
  6173. + * the token, the final ack does not, so we need a separate hashtable
  6174. + * to retrieve the mpcb.
  6175. + */
  6176. +struct list_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
  6177. +spinlock_t mptcp_reqsk_hlock; /* hashtable protection */
  6178. +
  6179. +/* Request socks hashed by their local token, searched so that a newly generated token does not collide with a pending one */
  6180. +static struct hlist_nulls_head mptcp_reqsk_tk_htb[MPTCP_HASH_SIZE];
  6181. +spinlock_t mptcp_tk_hashlock; /* taken around updates of mptcp_reqsk_tk_htb and of tk_hashtable */
  6182. +
  6183. +static int mptcp_reqsk_find_tk(u32 token)
  6184. +{ /* returns 1 if a request sock in mptcp_reqsk_tk_htb already uses this local token, 0 otherwise */
  6185. + u32 hash = mptcp_hash_tk(token); /* takes no lock itself: the callers visible in this file hold the RCU read lock and mptcp_tk_hashlock */
  6186. + struct mptcp_request_sock *mtreqsk;
  6187. + const struct hlist_nulls_node *node;
  6188. +
  6189. + hlist_nulls_for_each_entry_rcu(mtreqsk, node,
  6190. + &mptcp_reqsk_tk_htb[hash], collide_tk) {
  6191. + if (token == mtreqsk->mptcp_loc_token)
  6192. + return 1;
  6193. + }
  6194. + return 0;
  6195. +}
  6196. +
  6197. +static void mptcp_reqsk_insert_tk(struct request_sock *reqsk, u32 token)
  6198. +{ /* links reqsk (through collide_tk) into the mptcp_reqsk_tk_htb bucket of token */
  6199. + u32 hash = mptcp_hash_tk(token); /* no locking here: mptcp_reqsk_new_mptcp() calls this with mptcp_tk_hashlock held */
  6200. +
  6201. + hlist_nulls_add_head_rcu(&mptcp_rsk(reqsk)->collide_tk,
  6202. + &mptcp_reqsk_tk_htb[hash]);
  6203. +}
  6204. +
  6205. +static void mptcp_reqsk_remove_tk(struct request_sock *reqsk)
  6206. +{ /* unlinks reqsk's collide_tk node from its hash list while holding mptcp_tk_hashlock */
  6207. + rcu_read_lock();
  6208. + spin_lock(&mptcp_tk_hashlock); /* plain spin_lock(): mptcp_reqsk_destructor() calls this only when in_softirq() */
  6209. + hlist_nulls_del_init_rcu(&mptcp_rsk(reqsk)->collide_tk);
  6210. + spin_unlock(&mptcp_tk_hashlock);
  6211. + rcu_read_unlock();
  6212. +}
  6213. +
  6214. +void mptcp_reqsk_destructor(struct request_sock *req)
  6215. +{ /* takes req out of the MPTCP hashtable it may be linked in */
  6216. + if (!mptcp_rsk(req)->mpcb) { /* no mpcb set: unlink collide_tk (token-collision table) */
  6217. + if (in_softirq()) {
  6218. + mptcp_reqsk_remove_tk(req);
  6219. + } else { /* not in softirq: same unlink, but using the _bh RCU read-side calls */
  6220. + rcu_read_lock_bh();
  6221. + spin_lock(&mptcp_tk_hashlock);
  6222. + hlist_nulls_del_init_rcu(&mptcp_rsk(req)->collide_tk);
  6223. + spin_unlock(&mptcp_tk_hashlock);
  6224. + rcu_read_unlock_bh();
  6225. + }
  6226. + } else { /* mpcb set: removal is delegated to mptcp_hash_request_remove() */
  6227. + mptcp_hash_request_remove(req);
  6228. + }
  6229. +}
  6230. +
  6231. +static void __mptcp_hash_insert(struct tcp_sock *meta_tp, u32 token)
  6232. +{ /* adds meta_tp to the tk_hashtable bucket of token; takes no lock itself (callers in this file hold mptcp_tk_hashlock) */
  6233. + u32 hash = mptcp_hash_tk(token);
  6234. + hlist_nulls_add_head_rcu(&meta_tp->tk_table, &tk_hashtable[hash]);
  6235. + meta_tp->inside_tk_table = 1; /* cleared again by mptcp_hash_remove()/mptcp_hash_remove_bh() */
  6236. +}
  6237. +
  6238. +static int mptcp_find_token(u32 token)
  6239. +{ /* returns 1 if a tcp_sock in tk_hashtable already uses this local token, 0 otherwise */
  6240. + u32 hash = mptcp_hash_tk(token);
  6241. + struct tcp_sock *meta_tp;
  6242. + const struct hlist_nulls_node *node;
  6243. +
  6244. + hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], tk_table) {
  6245. + if (token == meta_tp->mptcp_loc_token)
  6246. + return 1;
  6247. + }
  6248. + return 0;
  6249. +}
  6250. +
  6251. +static void mptcp_set_key_reqsk(struct request_sock *req,
  6252. + const struct sk_buff *skb)
  6253. +{ /* sets the request's local key from the skb's addresses and the request's ports, then derives the local token */
  6254. + struct inet_request_sock *ireq = inet_rsk(req);
  6255. + struct mptcp_request_sock *mtreq = mptcp_rsk(req);
  6256. +
  6257. + if (skb->protocol == htons(ETH_P_IP)) {
  6258. + mtreq->mptcp_loc_key = mptcp_v4_get_key(ip_hdr(skb)->saddr,
  6259. + ip_hdr(skb)->daddr,
  6260. + htons(ireq->ir_num),
  6261. + ireq->ir_rmt_port);
  6262. +#if IS_ENABLED(CONFIG_IPV6)
  6263. + } else {
  6264. + mtreq->mptcp_loc_key = mptcp_v6_get_key(ipv6_hdr(skb)->saddr.s6_addr32,
  6265. + ipv6_hdr(skb)->daddr.s6_addr32,
  6266. + htons(ireq->ir_num),
  6267. + ireq->ir_rmt_port);
  6268. +#endif
  6269. + } /* NOTE: without CONFIG_IPV6 a non-IPv4 skb leaves mptcp_loc_key untouched */
  6270. +
  6271. + mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL); /* NULL idsn: only the token is requested */
  6272. +}
  6273. +
  6274. +/* New MPTCP-connection request, prepare a new token for the meta-socket that
  6275. + * will be created in mptcp_check_req_master(), and store the received key.
  6276. + */
  6277. +void mptcp_reqsk_new_mptcp(struct request_sock *req,
  6278. + const struct tcp_options_received *rx_opt,
  6279. + const struct mptcp_options_received *mopt,
  6280. + const struct sk_buff *skb)
  6281. +{ /* rx_opt is not used in this function */
  6282. + struct mptcp_request_sock *mtreq = mptcp_rsk(req);
  6283. +
  6284. + tcp_rsk(req)->saw_mpc = 1;
  6285. +
  6286. + rcu_read_lock();
  6287. + spin_lock(&mptcp_tk_hashlock);
  6288. + do { /* repeat until the token is found in neither token table */
  6289. + mptcp_set_key_reqsk(req, skb);
  6290. + } while (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) ||
  6291. + mptcp_find_token(mtreq->mptcp_loc_token));
  6292. +
  6293. + mptcp_reqsk_insert_tk(req, mtreq->mptcp_loc_token); /* still under the lock, so the check above and the insert are one step */
  6294. + spin_unlock(&mptcp_tk_hashlock);
  6295. + rcu_read_unlock();
  6296. + mtreq->mptcp_rem_key = mopt->mptcp_key; /* the peer's key, as parsed into mopt */
  6297. +}
  6298. +
  6299. +static void mptcp_set_key_sk(struct sock *sk)
  6300. +{ /* sets tp->mptcp_loc_key from the socket's own addresses and ports, then derives tp->mptcp_loc_token */
  6301. + struct tcp_sock *tp = tcp_sk(sk);
  6302. + struct inet_sock *isk = inet_sk(sk);
  6303. +
  6304. + if (sk->sk_family == AF_INET) /* NOTE: without CONFIG_IPV6 any other family leaves the key untouched */
  6305. + tp->mptcp_loc_key = mptcp_v4_get_key(isk->inet_saddr,
  6306. + isk->inet_daddr,
  6307. + isk->inet_sport,
  6308. + isk->inet_dport);
  6309. +#if IS_ENABLED(CONFIG_IPV6)
  6310. + else
  6311. + tp->mptcp_loc_key = mptcp_v6_get_key(inet6_sk(sk)->saddr.s6_addr32,
  6312. + sk->sk_v6_daddr.s6_addr32,
  6313. + isk->inet_sport,
  6314. + isk->inet_dport);
  6315. +#endif
  6316. +
  6317. + mptcp_key_sha1(tp->mptcp_loc_key,
  6318. + &tp->mptcp_loc_token, NULL);
  6319. +}
  6320. +
  6321. +void mptcp_connect_init(struct sock *sk)
  6322. +{ /* picks a local key whose token is in neither token table, then adds the socket to tk_hashtable */
  6323. + struct tcp_sock *tp = tcp_sk(sk);
  6324. +
  6325. + rcu_read_lock_bh();
  6326. + spin_lock(&mptcp_tk_hashlock);
  6327. + do { /* repeat until the token is found in neither token table */
  6328. + mptcp_set_key_sk(sk);
  6329. + } while (mptcp_reqsk_find_tk(tp->mptcp_loc_token) ||
  6330. + mptcp_find_token(tp->mptcp_loc_token));
  6331. +
  6332. + __mptcp_hash_insert(tp, tp->mptcp_loc_token); /* also sets tp->inside_tk_table */
  6333. + spin_unlock(&mptcp_tk_hashlock);
  6334. + rcu_read_unlock_bh();
  6335. +}
  6336. +
  6337. +/**
  6338. + * Look up the meta-socket for @token in @net and take a reference on it
  6339. + * (sk_refcnt). Returns NULL if none matches; otherwise the caller has to
  6340. + * drop that reference when it is done with the socket.
  6341. + */
  6342. +struct sock *mptcp_hash_find(struct net *net, u32 token)
  6343. +{
  6344. + u32 hash = mptcp_hash_tk(token);
  6345. + struct tcp_sock *meta_tp;
  6346. + struct sock *meta_sk = NULL;
  6347. + struct hlist_nulls_node *node;
  6348. +
  6349. + rcu_read_lock();
  6350. + hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash],
  6351. + tk_table) {
  6352. + meta_sk = (struct sock *)meta_tp;
  6353. + if (token == meta_tp->mptcp_loc_token &&
  6354. + net_eq(net, sock_net(meta_sk)) &&
  6355. + atomic_inc_not_zero(&meta_sk->sk_refcnt)) /* a socket whose refcount already hit zero is skipped */
  6356. + break;
  6357. + meta_sk = NULL; /* no match: make sure NULL is returned if the list ends here */
  6358. + }
  6359. + rcu_read_unlock();
  6360. + return meta_sk;
  6361. +}
  6362. +
  6363. +void mptcp_hash_remove_bh(struct tcp_sock *meta_tp)
  6364. +{ /* same as mptcp_hash_remove(), but with the _bh RCU read-side calls */
  6365. + /* remove from the token hashtable */
  6366. + rcu_read_lock_bh();
  6367. + spin_lock(&mptcp_tk_hashlock);
  6368. + hlist_nulls_del_init_rcu(&meta_tp->tk_table);
  6369. + meta_tp->inside_tk_table = 0;
  6370. + spin_unlock(&mptcp_tk_hashlock);
  6371. + rcu_read_unlock_bh();
  6372. +}
  6373. +
  6374. +void mptcp_hash_remove(struct tcp_sock *meta_tp)
  6375. +{ /* unlinks meta_tp from the token hashtable and clears inside_tk_table, under mptcp_tk_hashlock */
  6376. + rcu_read_lock();
  6377. + spin_lock(&mptcp_tk_hashlock); /* NOTE(review): plain spin_lock() - presumably callers already run with BHs disabled; confirm */
  6378. + hlist_nulls_del_init_rcu(&meta_tp->tk_table);
  6379. + meta_tp->inside_tk_table = 0;
  6380. + spin_unlock(&mptcp_tk_hashlock);
  6381. + rcu_read_unlock();
  6382. +}
  6383. +
  6384. +static struct sock *mptcp_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
  6385. + struct request_sock *req,
  6386. + struct dst_entry *dst)
  6387. +{ /* dispatches to the syn_recv_sock routine matching the families of sk and of the request */
  6388. +#if IS_ENABLED(CONFIG_IPV6)
  6389. + if (sk->sk_family == AF_INET6)
  6390. + return tcp_v6_syn_recv_sock(sk, skb, req, dst);
  6391. +
  6392. + /* sk->sk_family == AF_INET */
  6393. + if (req->rsk_ops->family == AF_INET6)
  6394. + return mptcp_v6v4_syn_recv_sock(sk, skb, req, dst);
  6395. +#endif
  6396. +
  6397. + /* sk->sk_family == AF_INET && req->rsk_ops->family == AF_INET */
  6398. + return tcp_v4_syn_recv_sock(sk, skb, req, dst);
  6399. +}
  6400. +
  6401. +struct sock *mptcp_select_ack_sock(const struct sock *meta_sk, int copied)
  6402. +{ /* picks the subflow to send a window-update on; returns NULL if no subflow qualifies */
  6403. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  6404. + struct sock *sk, *subsk = NULL;
  6405. + u32 max_data_seq = 0;
  6406. + /* max_data_seq is initialized only to silence a compiler warning;
  6407. + * whether it holds a real value is tracked by max_data_seq_set.
  6408. + */
  6409. + short max_data_seq_set = 0;
  6410. + u32 min_time = 0xffffffff;
  6411. +
  6412. + /* How do we select the subflow to send the window-update on?
  6413. + *
  6414. + * 1. He has to be in a state where he can send an ack and is
  6415. + * operational (pf = 0).
  6416. + * 2. He has to be one of those subflow who recently
  6417. + * contributed to the received stream
  6418. + * (this guarantees a working subflow)
  6419. + * a) its latest data_seq received is after the original
  6420. + * copied_seq.
  6421. + * We select the one with the lowest rtt, so that the
  6422. + * window-update reaches our peer the fastest.
  6423. + * b) if no subflow has this kind of data_seq (e.g., very
  6424. + * strange meta-level retransmissions going on), we take
  6425. + * the subflow who last sent the highest data_seq.
  6426. + */
  6427. + mptcp_for_each_sk(meta_tp->mpcb, sk) {
  6428. + struct tcp_sock *tp = tcp_sk(sk);
  6429. +
  6430. + if (!mptcp_sk_can_send_ack(sk) || tp->pf)
  6431. + continue;
  6432. +
  6433. + /* Select among those who contributed to the
  6434. + * current receive-queue.
  6435. + */
  6436. + if (copied && after(tp->mptcp->last_data_seq, meta_tp->copied_seq - copied)) {
  6437. + if (tp->srtt < min_time) {
  6438. + min_time = tp->srtt;
  6439. + subsk = sk;
  6440. + max_data_seq_set = 0; /* with subsk set and this flag clear, both fallback tests below stay false */
  6441. + }
  6442. + continue;
  6443. + }
  6444. +
  6445. + if (!subsk && !max_data_seq_set) { /* first fallback candidate seeds max_data_seq */
  6446. + max_data_seq = tp->mptcp->last_data_seq;
  6447. + max_data_seq_set = 1;
  6448. + subsk = sk;
  6449. + }
  6450. +
  6451. + /* Otherwise, take the one with the highest data_seq */
  6452. + if ((!subsk || max_data_seq_set) &&
  6453. + after(tp->mptcp->last_data_seq, max_data_seq)) {
  6454. + max_data_seq = tp->mptcp->last_data_seq;
  6455. + subsk = sk;
  6456. + }
  6457. + }
  6458. +
  6459. + if (!subsk) { /* nothing selected: only log the state of every subflow */
  6460. + mptcp_debug("%s subsk is null, copied %d, cseq %u\n", __func__,
  6461. + copied, meta_tp->copied_seq);
  6462. + mptcp_for_each_sk(meta_tp->mpcb, sk) {
  6463. + struct tcp_sock *tp = tcp_sk(sk);
  6464. + mptcp_debug("%s pi %d state %u last_dseq %u\n",
  6465. + __func__, tp->mptcp->path_index, sk->sk_state,
  6466. + tp->mptcp->last_data_seq);
  6467. + }
  6468. + }
  6469. +
  6470. + return subsk;
  6471. +}
  6472. +EXPORT_SYMBOL(mptcp_select_ack_sock);
  6473. +
  6474. +static void mptcp_sock_def_error_report(struct sock *sk)
  6475. +{ /* error handling for a subflow sk: closes it and, if an infinite-mapping flag is set, fails the meta-sk as well */
  6476. + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
  6477. +
  6478. + if (!sock_flag(sk, SOCK_DEAD))
  6479. + mptcp_sub_close(sk, 0);
  6480. +
  6481. + if (mpcb->infinite_mapping_rcv || mpcb->infinite_mapping_snd ||
  6482. + mpcb->send_infinite_mapping) {
  6483. + struct sock *meta_sk = mptcp_meta_sk(sk);
  6484. +
  6485. + meta_sk->sk_err = sk->sk_err; /* hand the subflow's error codes up to the meta-sk */
  6486. + meta_sk->sk_err_soft = sk->sk_err_soft;
  6487. +
  6488. + if (!sock_flag(meta_sk, SOCK_DEAD))
  6489. + meta_sk->sk_error_report(meta_sk);
  6490. +
  6491. + tcp_done(meta_sk);
  6492. + }
  6493. +
  6494. + sk->sk_err = 0; /* the subflow's own error is cleared in every case */
  6495. + return;
  6496. +}
  6497. +
  6498. +static void mptcp_mpcb_put(struct mptcp_cb *mpcb)
  6499. +{ /* drops one mpcb reference; the last one cleans up the path manager state and frees the mpcb */
  6500. + if (atomic_dec_and_test(&mpcb->mpcb_refcnt)) {
  6501. + mptcp_cleanup_path_manager(mpcb);
  6502. + kmem_cache_free(mptcp_cb_cache, mpcb);
  6503. + }
  6504. +}
  6505. +
  6506. +static void mptcp_sock_destruct(struct sock *sk)
  6507. +{ /* sk_destruct handler (installed on the meta-sk by mptcp_alloc_mpcb()): frees tp->mptcp and drops mpcb references */
  6508. + struct tcp_sock *tp = tcp_sk(sk);
  6509. +
  6510. + inet_sock_destruct(sk);
  6511. +
  6512. + BUG_ON(!list_empty(&tp->mptcp->cb_list)); /* the socket must no longer be linked on cb_list at this point */
  6513. +
  6514. + kmem_cache_free(mptcp_sock_cache, tp->mptcp);
  6515. + tp->mptcp = NULL;
  6516. +
  6517. + if (!is_meta_sk(sk) && !tp->was_meta_sk) { /* subflow: release its references on the meta-sk and on the mpcb */
  6518. + /* Taken when mpcb pointer was set */
  6519. + sock_put(mptcp_meta_sk(sk));
  6520. + mptcp_mpcb_put(tp->mpcb);
  6521. + } else {
  6522. + struct mptcp_cb *mpcb = tp->mpcb;
  6523. + struct mptcp_tw *mptw;
  6524. +
  6525. + /* The mpcb is disappearing - we can make the final
  6526. + * update to the rcv_nxt of the time-wait-sock and remove
  6527. + * its reference to the mpcb.
  6528. + */
  6529. + spin_lock_bh(&mpcb->tw_lock);
  6530. + list_for_each_entry_rcu(mptw, &mpcb->tw_list, list) {
  6531. + list_del_rcu(&mptw->list);
  6532. + mptw->in_list = 0;
  6533. + mptcp_mpcb_put(mpcb); /* one reference dropped per time-wait entry unlinked */
  6534. + rcu_assign_pointer(mptw->mpcb, NULL);
  6535. + }
  6536. + spin_unlock_bh(&mpcb->tw_lock);
  6537. +
  6538. + mptcp_mpcb_put(mpcb); /* NOTE(review): presumably the meta-sk's own reference - confirm where it is taken */
  6539. +
  6540. + mptcp_debug("%s destroying meta-sk\n", __func__);
  6541. + }
  6542. +}
  6543. +
  6544. +void mptcp_destroy_sock(struct sock *sk)
  6545. +{ /* meta-sk: purges the reinject and ofo queues and closes leftover subflows; any other sk: mptcp_del_sock() */
  6546. + if (is_meta_sk(sk)) {
  6547. + struct sock *sk_it, *tmpsk;
  6548. +
  6549. + __skb_queue_purge(&tcp_sk(sk)->mpcb->reinject_queue);
  6550. + mptcp_purge_ofo_queue(tcp_sk(sk));
  6551. +
  6552. + /* We have to close all remaining subflows. Normally, they
  6553. + * should all be about to get closed. But, if the kernel is
  6554. + * forcing a closure (e.g., tcp_write_err), the subflows might
  6555. + * not have been closed properly (as we are waiting for the
  6556. + * DATA_ACK of the DATA_FIN).
  6557. + */
  6558. + mptcp_for_each_sk_safe(tcp_sk(sk)->mpcb, sk_it, tmpsk) {
  6559. + /* Already did call tcp_close - waiting for graceful
  6560. + * closure, or if we are retransmitting fast-close on
  6561. + * the subflow. The reset (or timeout) will kill the
  6562. + * subflow.
  6563. + */
  6564. + if (tcp_sk(sk_it)->closing ||
  6565. + tcp_sk(sk_it)->send_mp_fclose)
  6566. + continue;
  6567. +
  6568. + /* Allow the delayed work first to prevent time-wait state */
  6569. + if (delayed_work_pending(&tcp_sk(sk_it)->mptcp->work))
  6570. + continue;
  6571. +
  6572. + mptcp_sub_close(sk_it, 0);
  6573. + }
  6574. + } else {
  6575. + mptcp_del_sock(sk);
  6576. + }
  6577. +}
  6578. +
  6579. +static void mptcp_set_state(struct sock *sk)
  6580. +{ /* reacts to sk being in TCP_ESTABLISHED: may establish the meta-sk, and counts sk in cnt_established */
  6581. + struct sock *meta_sk = mptcp_meta_sk(sk);
  6582. +
  6583. + /* Meta is not yet established - wake up the application */
  6584. + if ((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) &&
  6585. + sk->sk_state == TCP_ESTABLISHED) {
  6586. + tcp_set_state(meta_sk, TCP_ESTABLISHED);
  6587. +
  6588. + if (!sock_flag(meta_sk, SOCK_DEAD)) {
  6589. + meta_sk->sk_state_change(meta_sk);
  6590. + sk_wake_async(meta_sk, SOCK_WAKE_IO, POLL_OUT);
  6591. + }
  6592. + }
  6593. +
  6594. + if (sk->sk_state == TCP_ESTABLISHED) {
  6595. + tcp_sk(sk)->mptcp->establish_increased = 1; /* records that this sk was added to cnt_established */
  6596. + tcp_sk(sk)->mpcb->cnt_established++;
  6597. + }
  6598. +}
  6599. +
  6600. +u32 mptcp_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; /* NOTE(review): presumably the secret fed into key generation; its users are outside this part of the file */
  6601. +u32 mptcp_key_seed = 0; /* NOTE(review): presumably consumed by the mptcp_v4/v6_get_key() helpers - confirm */
  6602. +
  6603. +void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn)
  6604. +{ /* SHA-1 over the 8-byte key (one padded 64-byte block); token and idsn are optional outputs, pass NULL to skip one */
  6605. + u32 workspace[SHA_WORKSPACE_WORDS];
  6606. + u32 mptcp_hashed_key[SHA_DIGEST_WORDS];
  6607. + u8 input[64];
  6608. + int i;
  6609. +
  6610. + memset(workspace, 0, sizeof(workspace));
  6611. +
  6612. + /* Initialize input with appropriate padding */
  6613. + memset(&input[9], 0, sizeof(input) - 10); /* -10, because the last byte
  6614. + * is explicitly set too */
  6615. + memcpy(input, &key, sizeof(key)); /* Copy key to the msg beginning */
  6616. + input[8] = 0x80; /* Padding: First bit after message = 1 */
  6617. + input[63] = 0x40; /* Padding: Length of the message = 64 bits */
  6618. +
  6619. + sha_init(mptcp_hashed_key);
  6620. + sha_transform(mptcp_hashed_key, input, workspace);
  6621. +
  6622. + for (i = 0; i < 5; i++) /* 5 words = the 160-bit digest, stored in big-endian byte order */
  6623. + mptcp_hashed_key[i] = cpu_to_be32(mptcp_hashed_key[i]);
  6624. +
  6625. + if (token)
  6626. + *token = mptcp_hashed_key[0]; /* first 32 bits of the digest */
  6627. + if (idsn)
  6628. + *idsn = *((u64 *)&mptcp_hashed_key[3]); /* last 64 bits of the digest, left big-endian (mptcp_alloc_mpcb() applies ntohll()) */
  6629. +}
  6630. +
  6631. +void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
  6632. + u32 *hash_out)
  6633. +{ /* HMAC-SHA1 keyed with key_1|key_2 (8 bytes read from each) over rand_1|rand_2 (4 bytes each); writes 5 words to hash_out */
  6634. + u32 workspace[SHA_WORKSPACE_WORDS];
  6635. + u8 input[128]; /* 2 512-bit blocks */
  6636. + int i;
  6637. +
  6638. + memset(workspace, 0, sizeof(workspace));
  6639. +
  6640. + /* Generate key xored with ipad */
  6641. + memset(input, 0x36, 64);
  6642. + for (i = 0; i < 8; i++)
  6643. + input[i] ^= key_1[i];
  6644. + for (i = 0; i < 8; i++)
  6645. + input[i + 8] ^= key_2[i];
  6646. +
  6647. + memcpy(&input[64], rand_1, 4);
  6648. + memcpy(&input[68], rand_2, 4);
  6649. + input[72] = 0x80; /* Padding: First bit after message = 1 */
  6650. + memset(&input[73], 0, 53); /* zeroes bytes 73..125; the length bytes follow */
  6651. +
  6652. + /* Padding: Length of the message = 512 + 64 bits */
  6653. + input[126] = 0x02;
  6654. + input[127] = 0x40;
  6655. +
  6656. + sha_init(hash_out);
  6657. + sha_transform(hash_out, input, workspace);
  6658. + memset(workspace, 0, sizeof(workspace));
  6659. +
  6660. + sha_transform(hash_out, &input[64], workspace);
  6661. + memset(workspace, 0, sizeof(workspace));
  6662. +
  6663. + for (i = 0; i < 5; i++) /* inner digest to big-endian bytes before it becomes the outer hash's message */
  6664. + hash_out[i] = cpu_to_be32(hash_out[i]);
  6665. +
  6666. + /* Prepare second part of hmac */
  6667. + memset(input, 0x5C, 64);
  6668. + for (i = 0; i < 8; i++)
  6669. + input[i] ^= key_1[i];
  6670. + for (i = 0; i < 8; i++)
  6671. + input[i + 8] ^= key_2[i];
  6672. +
  6673. + memcpy(&input[64], hash_out, 20);
  6674. + input[84] = 0x80;
  6675. + memset(&input[85], 0, 41); /* zeroes bytes 85..125; the length bytes follow */
  6676. +
  6677. + /* Padding: Length of the message = 512 + 160 bits */
  6678. + input[126] = 0x02;
  6679. + input[127] = 0xA0;
  6680. +
  6681. + sha_init(hash_out);
  6682. + sha_transform(hash_out, input, workspace);
  6683. + memset(workspace, 0, sizeof(workspace));
  6684. +
  6685. + sha_transform(hash_out, &input[64], workspace);
  6686. +
  6687. + for (i = 0; i < 5; i++) /* final digest is returned in big-endian byte order */
  6688. + hash_out[i] = cpu_to_be32(hash_out[i]);
  6689. +}
  6690. +
  6691. +static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk)
  6692. +{ /* the only code here clears rskq_defer_accept on the meta-sk; master_sk is not used */
  6693. + /* Socket-options handled by mptcp_inherit_sk while creating the meta-sk.
  6694. + * ======
  6695. + * SO_SNDBUF, SO_SNDBUFFORCE, SO_RCVBUF, SO_RCVBUFFORCE, SO_RCVLOWAT,
  6696. + * SO_RCVTIMEO, SO_SNDTIMEO, SO_ATTACH_FILTER, SO_DETACH_FILTER,
  6697. + * TCP_NODELAY, TCP_CORK
  6698. + *
  6699. + * Socket-options handled in this function here
  6700. + * ======
  6701. + * TCP_DEFER_ACCEPT
  6702. + *
  6703. + * Socket-options on the todo-list
  6704. + * ======
  6705. + * SO_BINDTODEVICE - should probably prevent creation of new subsocks
  6706. + * across other devices. - what about the api-draft?
  6707. + * SO_DEBUG
  6708. + * SO_REUSEADDR - probably we don't care about this
  6709. + * SO_DONTROUTE, SO_BROADCAST
  6710. + * SO_OOBINLINE
  6711. + * SO_LINGER
  6712. + * SO_TIMESTAMP* - I don't think this is of concern for a SOCK_STREAM
  6713. + * SO_PASSSEC - I don't think this is of concern for a SOCK_STREAM
  6714. + * SO_RXQ_OVFL
  6715. + * TCP_COOKIE_TRANSACTIONS
  6716. + * TCP_MAXSEG
  6717. + * TCP_THIN_* - Handled by mptcp_inherit_sk, but we need to support this
  6718. + * in mptcp_retransmit_timer. AND we need to check what is
  6719. + * about the subsockets.
  6720. + * TCP_LINGER2
  6721. + * TCP_WINDOW_CLAMP
  6722. + * TCP_USER_TIMEOUT
  6723. + * TCP_MD5SIG
  6724. + *
  6725. + * Socket-options of no concern for the meta-socket (but for the subsocket)
  6726. + * ======
  6727. + * SO_PRIORITY
  6728. + * SO_MARK
  6729. + * TCP_CONGESTION
  6730. + * TCP_SYNCNT
  6731. + * TCP_QUICKACK
  6732. + * SO_KEEPALIVE
  6733. + */
  6734. +
  6735. + /****** DEFER_ACCEPT-handler ******/
  6736. +
  6737. + /* DEFER_ACCEPT is not of concern for new subflows - we always accept
  6738. + * them
  6739. + */
  6740. + inet_csk(meta_sk)->icsk_accept_queue.rskq_defer_accept = 0;
  6741. +}
  6742. +
  6743. +static void mptcp_sub_inherit_sockopts(struct sock *meta_sk, struct sock *sub_sk)
  6744. +{ /* copies TOS, sk_reuse and the user-lock bits from the meta-sk to a subflow socket */
  6745. + /* IP_TOS also goes to the subflow. */
  6746. + if (inet_sk(sub_sk)->tos != inet_sk(meta_sk)->tos) { /* sk_priority is copied, and sk_dst_reset() called, only when the TOS differs */
  6747. + inet_sk(sub_sk)->tos = inet_sk(meta_sk)->tos;
  6748. + sub_sk->sk_priority = meta_sk->sk_priority;
  6749. + sk_dst_reset(sub_sk);
  6750. + }
  6751. +
  6752. + /* Inherit SO_REUSEADDR */
  6753. + sub_sk->sk_reuse = meta_sk->sk_reuse;
  6754. +
  6755. + /* Inherit snd/rcv-buffer locks */
  6756. + sub_sk->sk_userlocks = meta_sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; /* every lock bit except SOCK_BINDPORT_LOCK */
  6757. +}
  6758. +
  6759. +int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb)
  6760. +{ /* sk_backlog_rcv handler of the meta-sk: passes skb to tcp_v4/v6_do_rcv() of the socket recorded in skb->sk */
  6761. + /* skb->sk may be NULL if we receive a packet immediately after the
  6762. + * SYN/ACK + MP_CAPABLE.
  6763. + */
  6764. + struct sock *sk = skb->sk ? skb->sk : meta_sk;
  6765. + int ret = 0;
  6766. +
  6767. + skb->sk = NULL;
  6768. +
  6769. + if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { /* refcount already zero: drop the packet */
  6770. + kfree_skb(skb);
  6771. + return 0;
  6772. + }
  6773. +
  6774. + if (sk->sk_family == AF_INET)
  6775. + ret = tcp_v4_do_rcv(sk, skb);
  6776. +#if IS_ENABLED(CONFIG_IPV6)
  6777. + else
  6778. + ret = tcp_v6_do_rcv(sk, skb);
  6779. +#endif
  6780. +
  6781. + sock_put(sk); /* pairs with the atomic_inc_not_zero() above */
  6782. + return ret;
  6783. +}
  6784. +
  6785. +struct lock_class_key meta_key; /* lockdep class passed as "sk_lock-AF_INET-MPTCP" by mptcp_inherit_sk() */
  6786. +struct lock_class_key meta_slock_key; /* lockdep class passed as "slock-AF_INET-MPTCP" by mptcp_inherit_sk() */
  6787. +
  6788. +/* Code heavily inspired from sk_clone() */
  6789. +static int mptcp_inherit_sk(const struct sock *sk, struct sock *newsk,
  6790. + int family, const gfp_t flags)
  6791. +{ /* makes newsk a copy of sk with fresh queues, locks and counters; returns 0, or -ENOMEM after freeing newsk; flags is unused */
  6792. + struct sk_filter *filter;
  6793. + struct proto *prot = newsk->sk_prot; /* saved: the memcpy below overwrites it */
  6794. + const struct inet_connection_sock_af_ops *af_ops = inet_csk(newsk)->icsk_af_ops; /* saved for the same reason */
  6795. +#ifdef CONFIG_SECURITY_NETWORK
  6796. + void *sptr = newsk->sk_security;
  6797. +#endif
  6798. +
  6799. + if (sk->sk_family == AF_INET) { /* the two branches differ only in the struct size copied (tcp_sock vs tcp6_sock) */
  6800. + memcpy(newsk, sk, offsetof(struct sock, sk_dontcopy_begin));
  6801. + memcpy(&newsk->sk_dontcopy_end, &sk->sk_dontcopy_end,
  6802. + sizeof(struct tcp_sock) - offsetof(struct sock, sk_dontcopy_end));
  6803. + } else {
  6804. + memcpy(newsk, sk, offsetof(struct sock, sk_dontcopy_begin));
  6805. + memcpy(&newsk->sk_dontcopy_end, &sk->sk_dontcopy_end,
  6806. + sizeof(struct tcp6_sock) - offsetof(struct sock, sk_dontcopy_end));
  6807. + }
  6808. +
  6809. +#ifdef CONFIG_SECURITY_NETWORK
  6810. + newsk->sk_security = sptr;
  6811. + security_sk_clone(sk, newsk);
  6812. +#endif
  6813. +
  6814. + /* Has been changed by the memcpy above - we may need an IPv6-socket */
  6815. + newsk->sk_family = family;
  6816. + newsk->sk_prot = prot;
  6817. + newsk->sk_prot_creator = prot;
  6818. + inet_csk(newsk)->icsk_af_ops = af_ops;
  6819. +
  6820. + /* We don't yet have the mptcp-pointer. Thus we still need inet_sock_destruct */
  6821. + newsk->sk_destruct = inet_sock_destruct;
  6822. +
  6823. + /* SANITY */
  6824. + get_net(sock_net(newsk));
  6825. + sk_node_init(&newsk->sk_node);
  6826. + sock_lock_init_class_and_name(newsk, "slock-AF_INET-MPTCP",
  6827. + &meta_slock_key, "sk_lock-AF_INET-MPTCP",
  6828. + &meta_key);
  6829. +
  6830. + /* Unlocks are in:
  6831. + *
  6832. + * 1. If we are creating the master-sk
  6833. + * * on client-side in tcp_rcv_state_process, "case TCP_SYN_SENT"
  6834. + * * on server-side in tcp_child_process
  6835. + * 2. If we are creating another subsock
  6836. + * * Also in tcp_child_process
  6837. + */
  6838. + bh_lock_sock(newsk);
  6839. + newsk->sk_backlog.head = NULL;
  6840. + newsk->sk_backlog.tail = NULL;
  6841. + newsk->sk_backlog.len = 0;
  6842. +
  6843. + atomic_set(&newsk->sk_rmem_alloc, 0);
  6844. + atomic_set(&newsk->sk_wmem_alloc, 1);
  6845. + atomic_set(&newsk->sk_omem_alloc, 0);
  6846. +
  6847. + skb_queue_head_init(&newsk->sk_receive_queue);
  6848. + skb_queue_head_init(&newsk->sk_write_queue);
  6849. +#ifdef CONFIG_NET_DMA
  6850. + skb_queue_head_init(&newsk->sk_async_wait_queue);
  6851. +#endif
  6852. +
  6853. + spin_lock_init(&newsk->sk_dst_lock);
  6854. + rwlock_init(&newsk->sk_callback_lock);
  6855. + lockdep_set_class_and_name(&newsk->sk_callback_lock,
  6856. + af_callback_keys + newsk->sk_family,
  6857. + af_family_clock_key_strings[newsk->sk_family]);
  6858. + newsk->sk_dst_cache = NULL;
  6859. + newsk->sk_rx_dst = NULL;
  6860. + newsk->sk_wmem_queued = 0;
  6861. + newsk->sk_forward_alloc = 0;
  6862. + newsk->sk_send_head = NULL;
  6863. + newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
  6864. +
  6865. + tcp_sk(newsk)->mptcp = NULL;
  6866. +
  6867. + sock_reset_flag(newsk, SOCK_DONE);
  6868. + skb_queue_head_init(&newsk->sk_error_queue);
  6869. +
  6870. + filter = rcu_dereference_protected(newsk->sk_filter, 1);
  6871. + if (filter != NULL)
  6872. + sk_filter_charge(newsk, filter);
  6873. +
  6874. + if (unlikely(xfrm_sk_clone_policy(newsk))) {
  6875. + /* It is still raw copy of parent, so invalidate
  6876. + * destructor and make plain sk_free()
  6877. + */
  6878. + newsk->sk_destruct = NULL;
  6879. + bh_unlock_sock(newsk);
  6880. + sk_free(newsk);
  6881. + newsk = NULL; /* only clears this function's copy of the pointer */
  6882. + return -ENOMEM;
  6883. + }
  6884. +
  6885. + newsk->sk_err = 0;
  6886. + newsk->sk_priority = 0;
  6887. + /* Before updating sk_refcnt, we must commit prior changes to memory
  6888. + * (Documentation/RCU/rculist_nulls.txt for details)
  6889. + */
  6890. + smp_wmb();
  6891. + atomic_set(&newsk->sk_refcnt, 2);
  6892. +
  6893. + /* Increment the counter in the same struct proto as the master
  6894. + * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
  6895. + * is the same as sk->sk_prot->socks, as this field was copied
  6896. + * with memcpy).
  6897. + *
  6898. + * This _changes_ the previous behaviour, where
  6899. + * tcp_create_openreq_child always was incrementing the
  6900. + * equivalent to tcp_prot->socks (inet_sock_nr), so this have
  6901. + * to be taken into account in all callers. -acme
  6902. + */
  6903. + sk_refcnt_debug_inc(newsk);
  6904. + sk_set_socket(newsk, NULL);
  6905. + newsk->sk_wq = NULL;
  6906. +
  6907. + if (newsk->sk_prot->sockets_allocated)
  6908. + percpu_counter_inc(newsk->sk_prot->sockets_allocated);
  6909. +
  6910. + if (sock_flag(newsk, SOCK_TIMESTAMP) ||
  6911. + sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
  6912. + net_enable_timestamp();
  6913. +
  6914. + return 0;
  6915. +}
  6916. +
  6917. +int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key, u32 window)
  6918. +{
  6919. + struct mptcp_cb *mpcb;
  6920. + struct sock *master_sk;
  6921. + struct inet_connection_sock *master_icsk, *meta_icsk = inet_csk(meta_sk);
  6922. + struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk);
  6923. + struct sk_buff *skb, *tmp;
  6924. + u64 idsn;
  6925. +
  6926. + master_sk = sk_prot_alloc(meta_sk->sk_prot, GFP_ATOMIC | __GFP_ZERO,
  6927. + meta_sk->sk_family);
  6928. + if (!master_sk)
  6929. + return -ENOBUFS;
  6930. +
  6931. + master_tp = tcp_sk(master_sk);
  6932. + master_icsk = inet_csk(master_sk);
  6933. +
  6934. + /* Need to set this here - it is needed by mptcp_inherit_sk */
  6935. + master_sk->sk_prot = meta_sk->sk_prot;
  6936. + master_sk->sk_prot_creator = meta_sk->sk_prot;
  6937. + master_icsk->icsk_af_ops = meta_icsk->icsk_af_ops;
  6938. +
  6939. + mpcb = kmem_cache_zalloc(mptcp_cb_cache, GFP_ATOMIC);
  6940. + if (!mpcb) {
  6941. + sk_free(master_sk);
  6942. + return -ENOBUFS;
  6943. + }
  6944. +
  6945. + /* master_sk inherits from meta_sk */
  6946. + if (mptcp_inherit_sk(meta_sk, master_sk, meta_sk->sk_family, GFP_ATOMIC)) {
  6947. + kmem_cache_free(mptcp_cb_cache, mpcb);
  6948. + return -ENOBUFS;
  6949. + }
  6950. +
  6951. +#if IS_ENABLED(CONFIG_IPV6)
  6952. + if (meta_icsk->icsk_af_ops == &ipv6_mapped) {
  6953. + struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
  6954. +
  6955. + inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
  6956. +
  6957. + newnp = inet6_sk(master_sk);
  6958. + memcpy(newnp, np, sizeof(struct ipv6_pinfo));
  6959. +
  6960. + newnp->ipv6_mc_list = NULL;
  6961. + newnp->ipv6_ac_list = NULL;
  6962. + newnp->ipv6_fl_list = NULL;
  6963. + newnp->opt = NULL;
  6964. + newnp->pktoptions = NULL;
  6965. + (void)xchg(&newnp->rxpmtu, NULL);
  6966. + } else if (meta_sk->sk_family == AF_INET6) {
  6967. + struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
  6968. +
  6969. + inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
  6970. +
  6971. + newnp = inet6_sk(master_sk);
  6972. + memcpy(newnp, np, sizeof(struct ipv6_pinfo));
  6973. +
  6974. + newnp->hop_limit = -1;
  6975. + newnp->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
  6976. + newnp->mc_loop = 1;
  6977. + newnp->pmtudisc = IPV6_PMTUDISC_WANT;
  6978. + newnp->ipv6only = sock_net(master_sk)->ipv6.sysctl.bindv6only;
  6979. + }
  6980. +#endif
  6981. +
  6982. + meta_tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, GFP_ATOMIC);
  6983. + if (!meta_tp->mptcp) {
  6984. + kmem_cache_free(mptcp_cb_cache, mpcb);
  6985. + sk_free(master_sk);
  6986. + return -ENOBUFS;
  6987. + }
  6988. +
  6989. + INIT_LIST_HEAD(&meta_tp->mptcp->cb_list);
  6990. +
  6991. + /* Store the keys and generate the peer's token */
  6992. + mpcb->mptcp_loc_key = meta_tp->mptcp_loc_key;
  6993. + mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token;
  6994. +
  6995. + /* Generate Initial data-sequence-numbers */
  6996. + mptcp_key_sha1(mpcb->mptcp_loc_key, NULL, &idsn);
  6997. + idsn = ntohll(idsn) + 1;
  6998. + mpcb->snd_high_order[0] = idsn >> 32;
  6999. + mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1;
  7000. +
  7001. + meta_tp->write_seq = (u32)idsn;
  7002. + meta_tp->snd_sml = meta_tp->write_seq;
  7003. + meta_tp->snd_una = meta_tp->write_seq;
  7004. + meta_tp->snd_nxt = meta_tp->write_seq;
  7005. + meta_tp->pushed_seq = meta_tp->write_seq;
  7006. + meta_tp->snd_up = meta_tp->write_seq;
  7007. +
  7008. + mpcb->mptcp_rem_key = remote_key;
  7009. + mptcp_key_sha1(mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &idsn);
  7010. + idsn = ntohll(idsn) + 1;
  7011. + mpcb->rcv_high_order[0] = idsn >> 32;
  7012. + mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1;
  7013. + meta_tp->copied_seq = (u32) idsn;
  7014. + meta_tp->rcv_nxt = (u32) idsn;
  7015. + meta_tp->rcv_wup = (u32) idsn;
  7016. +
  7017. + meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1;
  7018. + meta_tp->snd_wnd = window;
  7019. + meta_tp->retrans_stamp = 0; /* Set in tcp_connect() */
  7020. +
  7021. + meta_tp->packets_out = 0;
  7022. + meta_tp->mptcp->snt_isn = meta_tp->write_seq; /* Initial data-sequence-number */
  7023. + meta_icsk->icsk_probes_out = 0;
  7024. +
  7025. + /* Set mptcp-pointers */
  7026. + master_tp->mpcb = mpcb;
  7027. + master_tp->meta_sk = meta_sk;
  7028. + meta_tp->mpcb = mpcb;
  7029. + meta_tp->meta_sk = meta_sk;
  7030. + mpcb->meta_sk = meta_sk;
  7031. + mpcb->master_sk = master_sk;
  7032. +
  7033. + set_mpc(meta_tp);
  7034. + meta_tp->mptcp->attached = 0;
  7035. + meta_tp->was_meta_sk = 0;
  7036. +
  7037. + /* Initialize the queues */
  7038. + skb_queue_head_init(&mpcb->reinject_queue);
  7039. + skb_queue_head_init(&master_tp->out_of_order_queue);
  7040. + tcp_prequeue_init(master_tp);
  7041. + INIT_LIST_HEAD(&master_tp->tsq_node);
  7042. +
  7043. + master_tp->tsq_flags = 0;
  7044. +
  7045. + /* Copy the write-queue from the meta down to the master.
  7046. + * This is necessary to get the SYN to the master-write-queue.
  7047. + * No other data can be queued, before tcp_sendmsg waits for the
  7048. + * connection to finish.
  7049. + */
  7050. + skb_queue_walk_safe(&meta_sk->sk_write_queue, skb, tmp) {
  7051. + skb_unlink(skb, &meta_sk->sk_write_queue);
  7052. + skb_queue_tail(&master_sk->sk_write_queue, skb);
  7053. +
  7054. + master_sk->sk_wmem_queued += skb->truesize;
  7055. + sk_mem_charge(master_sk, skb->truesize);
  7056. + }
  7057. +
  7058. + meta_sk->sk_wmem_queued = 0;
  7059. + meta_sk->sk_forward_alloc = 0;
  7060. +
  7061. + mutex_init(&mpcb->mpcb_mutex);
  7062. +
  7063. + /* Init the accept_queue structure, we support a queue of 32 pending
  7064. + * connections, it does not need to be huge, since we only store here
  7065. + * pending subflow creations.
  7066. + */
  7067. + if (reqsk_queue_alloc(&meta_icsk->icsk_accept_queue, 32, GFP_ATOMIC)) {
  7068. + inet_put_port(master_sk);
  7069. + kmem_cache_free(mptcp_sock_cache, meta_tp->mptcp);
  7070. + kmem_cache_free(mptcp_cb_cache, mpcb);
  7071. + sk_free(master_sk);
  7072. + reset_mpc(meta_tp);
  7073. + return -ENOMEM;
  7074. + }
  7075. +
  7076. + /* Redefine function-pointers as the meta-sk is now fully ready */
  7077. + meta_sk->sk_backlog_rcv = mptcp_backlog_rcv;
  7078. + meta_sk->sk_destruct = mptcp_sock_destruct;
  7079. + mpcb->syn_recv_sock = mptcp_syn_recv_sock;
  7080. +
  7081. + /* Meta-level retransmit timer */
  7082. + meta_icsk->icsk_rto *= 2; /* Double of initial - rto */
  7083. +
  7084. + tcp_init_xmit_timers(master_sk);
  7085. + /* Has been set for sending out the SYN */
  7086. + inet_csk_clear_xmit_timer(meta_sk, ICSK_TIME_RETRANS);
  7087. +
  7088. + if (!meta_tp->inside_tk_table) {
  7089. + /* Adding the meta_tp in the token hashtable - coming from server-side */
  7090. + rcu_read_lock();
  7091. + spin_lock(&mptcp_tk_hashlock);
  7092. +
  7093. + __mptcp_hash_insert(meta_tp, mpcb->mptcp_loc_token);
  7094. +
  7095. + spin_unlock(&mptcp_tk_hashlock);
  7096. + rcu_read_unlock();
  7097. + }
  7098. + master_tp->inside_tk_table = 0;
  7099. +
  7100. + /* Init time-wait stuff */
  7101. + INIT_LIST_HEAD(&mpcb->tw_list);
  7102. + spin_lock_init(&mpcb->tw_lock);
  7103. +
  7104. + INIT_LIST_HEAD(&mpcb->callback_list);
  7105. +
  7106. + mptcp_mpcb_inherit_sockopts(meta_sk, master_sk);
  7107. +
  7108. + mpcb->orig_sk_rcvbuf = meta_sk->sk_rcvbuf;
  7109. + mpcb->orig_sk_sndbuf = meta_sk->sk_sndbuf;
  7110. + mpcb->orig_window_clamp = meta_tp->window_clamp;
  7111. +
  7112. + /* The meta is directly linked - set refcnt to 1 */
  7113. + atomic_set(&mpcb->mpcb_refcnt, 1);
  7114. +
  7115. + mptcp_init_path_manager(mpcb);
  7116. +
  7117. + mptcp_debug("%s: created mpcb with token %#x\n",
  7118. + __func__, mpcb->mptcp_loc_token);
  7119. +
  7120. + return 0;
  7121. +}
  7122. +
  7123. +struct sock *mptcp_sk_clone(const struct sock *sk, int family,
  7124. + const gfp_t priority)
  7125. +{
  7126. + struct sock *newsk = NULL;
  7127. +
  7128. + if (family == AF_INET && sk->sk_family == AF_INET) {
  7129. + newsk = sk_prot_alloc(&tcp_prot, priority, family);
  7130. + if (!newsk)
  7131. + return NULL;
  7132. +
  7133. + /* Set these pointers - they are needed by mptcp_inherit_sk */
  7134. + newsk->sk_prot = &tcp_prot;
  7135. + newsk->sk_prot_creator = &tcp_prot;
  7136. + inet_csk(newsk)->icsk_af_ops = &ipv4_specific;
  7137. + newsk->sk_family = AF_INET;
  7138. + }
  7139. +#if IS_ENABLED(CONFIG_IPV6)
  7140. + else {
  7141. + newsk = sk_prot_alloc(&tcpv6_prot, priority, family);
  7142. + if (!newsk)
  7143. + return NULL;
  7144. +
  7145. + newsk->sk_prot = &tcpv6_prot;
  7146. + newsk->sk_prot_creator = &tcpv6_prot;
  7147. + if (family == AF_INET)
  7148. + inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
  7149. + else
  7150. + inet_csk(newsk)->icsk_af_ops = &ipv6_specific;
  7151. + newsk->sk_family = AF_INET6;
  7152. + }
  7153. +#endif
  7154. +
  7155. + if (mptcp_inherit_sk(sk, newsk, family, priority))
  7156. + return NULL;
  7157. +
  7158. + return newsk;
  7159. +}
  7160. +
  7161. +void mptcp_fallback_meta_sk(struct sock *meta_sk)
  7162. +{
  7163. + kfree(inet_csk(meta_sk)->icsk_accept_queue.listen_opt);
  7164. + kmem_cache_free(mptcp_sock_cache, tcp_sk(meta_sk)->mptcp);
  7165. + kmem_cache_free(mptcp_cb_cache, tcp_sk(meta_sk)->mpcb);
  7166. +}
  7167. +
  7168. +int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
  7169. + gfp_t flags)
  7170. +{
  7171. + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
  7172. + struct tcp_sock *tp = tcp_sk(sk);
  7173. +
  7174. + tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, flags);
  7175. + if (!tp->mptcp)
  7176. + return -ENOMEM;
  7177. +
  7178. + tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb);
  7179. + /* No more space for more subflows? */
  7180. + if (!tp->mptcp->path_index) {
  7181. + kmem_cache_free(mptcp_sock_cache, tp->mptcp);
  7182. + return -EPERM;
  7183. + }
  7184. +
  7185. + INIT_LIST_HEAD(&tp->mptcp->cb_list);
  7186. +
  7187. + tp->mptcp->tp = tp;
  7188. + tp->mpcb = mpcb;
  7189. + tp->meta_sk = meta_sk;
  7190. + set_mpc(tp);
  7191. + tp->mptcp->loc_id = loc_id;
  7192. + tp->mptcp->rem_id = rem_id;
  7193. + tp->mptcp->last_rbuf_opti = tcp_time_stamp;
  7194. +
  7195. + /* The corresponding sock_put is in mptcp_sock_destruct(). It cannot be
  7196. + * included in mptcp_del_sock(), because the mpcb must remain alive
  7197. + * until the last subsocket is completely destroyed.
  7198. + */
  7199. + sock_hold(meta_sk);
  7200. + atomic_inc(&mpcb->mpcb_refcnt);
  7201. +
  7202. + tp->mptcp->next = mpcb->connection_list;
  7203. + mpcb->connection_list = tp;
  7204. + tp->mptcp->attached = 1;
  7205. +
  7206. + mpcb->cnt_subflows++;
  7207. + atomic_add(atomic_read(&((struct sock *)tp)->sk_rmem_alloc),
  7208. + &meta_sk->sk_rmem_alloc);
  7209. +
  7210. + mptcp_sub_inherit_sockopts(meta_sk, sk);
  7211. + INIT_DELAYED_WORK(&tp->mptcp->work, mptcp_sub_close_wq);
  7212. +
  7213. + /* As we successfully allocated the mptcp_tcp_sock, we have to
  7214. + * change the function-pointers here (for sk_destruct to work correctly)
  7215. + */
  7216. + sk->sk_error_report = mptcp_sock_def_error_report;
  7217. + sk->sk_data_ready = mptcp_data_ready;
  7218. + sk->sk_write_space = mptcp_write_space;
  7219. + sk->sk_state_change = mptcp_set_state;
  7220. + sk->sk_destruct = mptcp_sock_destruct;
  7221. +
  7222. + if (sk->sk_family == AF_INET)
  7223. + mptcp_debug("%s: token %#x pi %d, src_addr:%pI4:%d dst_addr:%pI4:%d, cnt_subflows now %d\n",
  7224. + __func__ , mpcb->mptcp_loc_token,
  7225. + tp->mptcp->path_index,
  7226. + &((struct inet_sock *)tp)->inet_saddr,
  7227. + ntohs(((struct inet_sock *)tp)->inet_sport),
  7228. + &((struct inet_sock *)tp)->inet_daddr,
  7229. + ntohs(((struct inet_sock *)tp)->inet_dport),
  7230. + mpcb->cnt_subflows);
  7231. +#if IS_ENABLED(CONFIG_IPV6)
  7232. + else
  7233. + mptcp_debug("%s: token %#x pi %d, src_addr:%pI6:%d dst_addr:%pI6:%d, cnt_subflows now %d\n",
  7234. + __func__ , mpcb->mptcp_loc_token,
  7235. + tp->mptcp->path_index, &inet6_sk(sk)->saddr,
  7236. + ntohs(((struct inet_sock *)tp)->inet_sport),
  7237. + &sk->sk_v6_daddr,
  7238. + ntohs(((struct inet_sock *)tp)->inet_dport),
  7239. + mpcb->cnt_subflows);
  7240. +#endif
  7241. +
  7242. + return 0;
  7243. +}
  7244. +
  7245. +void mptcp_del_sock(struct sock *sk)
  7246. +{
  7247. + struct tcp_sock *tp = tcp_sk(sk), *tp_prev;
  7248. + struct mptcp_cb *mpcb;
  7249. +
  7250. + if (!tp->mptcp || !tp->mptcp->attached)
  7251. + return;
  7252. +
  7253. + mpcb = tp->mpcb;
  7254. + tp_prev = mpcb->connection_list;
  7255. +
  7256. + mptcp_debug("%s: Removing subsock tok %#x pi:%d state %d is_meta? %d\n",
  7257. + __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index,
  7258. + sk->sk_state, is_meta_sk(sk));
  7259. +
  7260. + if (tp_prev == tp) {
  7261. + mpcb->connection_list = tp->mptcp->next;
  7262. + } else {
  7263. + for (; tp_prev && tp_prev->mptcp->next; tp_prev = tp_prev->mptcp->next) {
  7264. + if (tp_prev->mptcp->next == tp) {
  7265. + tp_prev->mptcp->next = tp->mptcp->next;
  7266. + break;
  7267. + }
  7268. + }
  7269. + }
  7270. + mpcb->cnt_subflows--;
  7271. + if (tp->mptcp->establish_increased)
  7272. + mpcb->cnt_established--;
  7273. +
  7274. + tp->mptcp->next = NULL;
  7275. + tp->mptcp->attached = 0;
  7276. + mpcb->path_index_bits &= ~(1 << tp->mptcp->path_index);
  7277. +
  7278. + if (!skb_queue_empty(&sk->sk_write_queue))
  7279. + mptcp_reinject_data(sk, 0);
  7280. +
  7281. + if (is_master_tp(tp))
  7282. + mpcb->master_sk = NULL;
  7283. + else if (tp->mptcp->pre_established)
  7284. + sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
  7285. +
  7286. + rcu_assign_pointer(inet_sk(sk)->inet_opt, NULL);
  7287. +}
  7288. +
  7289. +/* Updates the metasocket ULID/port data, based on the given sock.
  7290. + * The argument sock must be the sock accessible to the application.
  7291. + * In this function, we update the meta socket info, based on the changes
  7292. + * in the application socket (bind, address allocation, ...)
  7293. + */
  7294. +void mptcp_update_metasocket(struct sock *sk, struct sock *meta_sk)
  7295. +{
  7296. + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
  7297. + union inet_addr addr;
  7298. + int index;
  7299. +
  7300. + /* Get the index of the local address */
  7301. + if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) {
  7302. + addr.ip = inet_sk(sk)->inet_saddr;
  7303. + index = mpcb->pm_ops->get_local_index(AF_INET, &addr, sock_net(meta_sk));
  7304. + } else {
  7305. + addr.in6 = inet6_sk(sk)->saddr;
  7306. + index = mpcb->pm_ops->get_local_index(AF_INET6, &addr, sock_net(meta_sk));
  7307. + }
  7308. +
  7309. + if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) {
  7310. + mptcp_v4_add_raddress(mpcb,
  7311. + (struct in_addr *)&inet_sk(sk)->inet_daddr,
  7312. + 0, 0);
  7313. + if (index >= 0)
  7314. + mptcp_v4_set_init_addr_bit(mpcb, inet_sk(sk)->inet_daddr, index);
  7315. + } else {
  7316. +#if IS_ENABLED(CONFIG_IPV6)
  7317. + mptcp_v6_add_raddress(mpcb, &sk->sk_v6_daddr, 0, 0);
  7318. + if (index >= 0)
  7319. + mptcp_v6_set_init_addr_bit(mpcb, &sk->sk_v6_daddr, index);
  7320. +#endif
  7321. + }
  7322. +
  7323. + if (mpcb->pm_ops->new_session)
  7324. + mpcb->pm_ops->new_session(meta_sk, index);
  7325. +
  7326. + tcp_sk(sk)->mptcp->send_mp_prio = tcp_sk(sk)->mptcp->low_prio;
  7327. +}
  7328. +
  7329. +/* Clean up the receive buffer for full frames taken by the user,
  7330. + * then send an ACK if necessary. COPIED is the number of bytes
  7331. + * tcp_recvmsg has given to the user so far, it speeds up the
  7332. + * calculation of whether or not we must ACK for the sake of
  7333. + * a window update.
  7334. + */
  7335. +void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied)
  7336. +{
  7337. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  7338. + struct sock *sk;
  7339. + __u32 rcv_window_now = 0;
  7340. +
  7341. + if (copied > 0 && !(meta_sk->sk_shutdown & RCV_SHUTDOWN)) {
  7342. + rcv_window_now = tcp_receive_window(meta_tp);
  7343. +
  7344. + if (2 * rcv_window_now > meta_tp->window_clamp)
  7345. + rcv_window_now = 0;
  7346. + }
  7347. +
  7348. + mptcp_for_each_sk(meta_tp->mpcb, sk) {
  7349. + struct tcp_sock *tp = tcp_sk(sk);
  7350. + const struct inet_connection_sock *icsk = inet_csk(sk);
  7351. +
  7352. + if (!mptcp_sk_can_send_ack(sk))
  7353. + continue;
  7354. +
  7355. + if (!inet_csk_ack_scheduled(sk))
  7356. + goto second_part;
  7357. + /* Delayed ACKs frequently hit locked sockets during bulk
  7358. + * receive.
  7359. + */
  7360. + if (icsk->icsk_ack.blocked ||
  7361. + /* Once-per-two-segments ACK was not sent by tcp_input.c */
  7362. + tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
  7363. + /* If this read emptied read buffer, we send ACK, if
  7364. + * connection is not bidirectional, user drained
  7365. + * receive buffer and there was a small segment
  7366. + * in queue.
  7367. + */
  7368. + (copied > 0 &&
  7369. + ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
  7370. + ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
  7371. + !icsk->icsk_ack.pingpong)) &&
  7372. + !atomic_read(&meta_sk->sk_rmem_alloc))) {
  7373. + tcp_send_ack(sk);
  7374. + continue;
  7375. + }
  7376. +
  7377. +second_part:
  7378. + /* This here is the second part of tcp_cleanup_rbuf */
  7379. + if (rcv_window_now) {
  7380. + __u32 new_window = tp->__select_window(sk);
  7381. +
  7382. + /* Send ACK now, if this read freed lots of space
  7383. + * in our buffer. Certainly, new_window is new window.
  7384. + * We can advertise it now, if it is not less than
  7385. + * current one.
  7386. + * "Lots" means "at least twice" here.
  7387. + */
  7388. + if (new_window && new_window >= 2 * rcv_window_now)
  7389. + tcp_send_ack(sk);
  7390. + }
  7391. + }
  7392. +}
  7393. +
  7394. +static int mptcp_sub_send_fin(struct sock *sk)
  7395. +{
  7396. + struct tcp_sock *tp = tcp_sk(sk);
  7397. + struct sk_buff *skb = tcp_write_queue_tail(sk);
  7398. + int mss_now;
  7399. +
  7400. + /* Optimization, tack on the FIN if we have a queue of
  7401. + * unsent frames. But be careful about outgoing SACKS
  7402. + * and IP options.
  7403. + */
  7404. + mss_now = tcp_current_mss(sk);
  7405. +
  7406. + if (tcp_send_head(sk) != NULL) {
  7407. + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
  7408. + TCP_SKB_CB(skb)->end_seq++;
  7409. + tp->write_seq++;
  7410. + } else {
  7411. + skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_ATOMIC);
  7412. + if (!skb)
  7413. + return 1;
  7414. +
  7415. + /* Reserve space for headers and prepare control bits. */
  7416. + skb_reserve(skb, MAX_TCP_HEADER);
  7417. + /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
  7418. + tcp_init_nondata_skb(skb, tp->write_seq,
  7419. + TCPHDR_ACK | TCPHDR_FIN);
  7420. + tcp_queue_skb(sk, skb);
  7421. + }
  7422. + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
  7423. +
  7424. + return 0;
  7425. +}
  7426. +
  7427. +void mptcp_sub_close_wq(struct work_struct *work)
  7428. +{
  7429. + struct mptcp_tcp_sock *mptcp = container_of(work, struct mptcp_tcp_sock, work.work);
  7430. + struct tcp_sock *tp = mptcp->tp;
  7431. + struct sock *sk = (struct sock *)tp;
  7432. + struct sock *meta_sk = mptcp_meta_sk(sk);
  7433. +
  7434. + mutex_lock(&tp->mpcb->mpcb_mutex);
  7435. + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
  7436. +
  7437. + if (sock_flag(sk, SOCK_DEAD))
  7438. + goto exit;
  7439. +
  7440. + /* We come from tcp_disconnect. We are sure that meta_sk is set */
  7441. + if (!tp->mpc) {
  7442. + tp->closing = 1;
  7443. + sock_rps_reset_flow(sk);
  7444. + tcp_close(sk, 0);
  7445. + goto exit;
  7446. + }
  7447. +
  7448. + if (meta_sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) {
  7449. + tp->closing = 1;
  7450. + sock_rps_reset_flow(sk);
  7451. + tcp_close(sk, 0);
  7452. + } else if (tcp_close_state(sk)) {
  7453. + sk->sk_shutdown |= SEND_SHUTDOWN;
  7454. + tcp_send_fin(sk);
  7455. + }
  7456. +
  7457. +exit:
  7458. + release_sock(meta_sk);
  7459. + mutex_unlock(&tp->mpcb->mpcb_mutex);
  7460. + sock_put(sk);
  7461. +}
  7462. +
  7463. +void mptcp_sub_close(struct sock *sk, unsigned long delay)
  7464. +{
  7465. + struct tcp_sock *tp = tcp_sk(sk);
  7466. + struct delayed_work *work = &tcp_sk(sk)->mptcp->work;
  7467. +
  7468. + /* We are already closing - e.g., call from sock_def_error_report upon
  7469. + * tcp_disconnect in tcp_close.
  7470. + */
  7471. + if (tp->closing)
  7472. + return;
  7473. +
  7474. + /* Work already scheduled ? */
  7475. + if (work_pending(&work->work)) {
  7476. + /* Work present - who will be first ? */
  7477. + if (jiffies + delay > work->timer.expires)
  7478. + return;
  7479. +
  7480. + /* Try canceling - if it fails, work will be executed soon */
  7481. + if (!cancel_delayed_work(work))
  7482. + return;
  7483. + sock_put(sk);
  7484. + }
  7485. +
  7486. + if (!delay) {
  7487. + unsigned char old_state = sk->sk_state;
  7488. +
  7489. + /* If we are in user-context we can directly do the closing
  7490. + * procedure. No need to schedule a work-queue.
  7491. + */
  7492. + if (!in_softirq()) {
  7493. + if (sock_flag(sk, SOCK_DEAD))
  7494. + return;
  7495. +
  7496. + if (!tp->mpc) {
  7497. + tp->closing = 1;
  7498. + sock_rps_reset_flow(sk);
  7499. + tcp_close(sk, 0);
  7500. + return;
  7501. + }
  7502. +
  7503. + if (mptcp_meta_sk(sk)->sk_shutdown == SHUTDOWN_MASK ||
  7504. + sk->sk_state == TCP_CLOSE) {
  7505. + tp->closing = 1;
  7506. + sock_rps_reset_flow(sk);
  7507. + tcp_close(sk, 0);
  7508. + } else if (tcp_close_state(sk)) {
  7509. + sk->sk_shutdown |= SEND_SHUTDOWN;
  7510. + tcp_send_fin(sk);
  7511. + }
  7512. +
  7513. + return;
  7514. + }
  7515. +
  7516. + /* We directly send the FIN. Because it may take so a long time,
  7517. + * untile the work-queue will get scheduled...
  7518. + *
  7519. + * If mptcp_sub_send_fin returns 1, it failed and thus we reset
  7520. + * the old state so that tcp_close will finally send the fin
  7521. + * in user-context.
  7522. + */
  7523. + if (!sk->sk_err && old_state != TCP_CLOSE &&
  7524. + tcp_close_state(sk) && mptcp_sub_send_fin(sk)) {
  7525. + if (old_state == TCP_ESTABLISHED)
  7526. + TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
  7527. + sk->sk_state = old_state;
  7528. + }
  7529. + }
  7530. +
  7531. + sock_hold(sk);
  7532. + queue_delayed_work(mptcp_wq, work, delay);
  7533. +}
  7534. +
  7535. +void mptcp_sub_force_close(struct sock *sk)
  7536. +{
  7537. + /* The below tcp_done may have freed the socket, if he is already dead.
  7538. + * Thus, we are not allowed to access it afterwards. That's why
  7539. + * we have to store the dead-state in this local variable.
  7540. + */
  7541. + int sock_is_dead = sock_flag(sk, SOCK_DEAD);
  7542. +
  7543. + tcp_sk(sk)->mp_killed = 1;
  7544. +
  7545. + if (sk->sk_state != TCP_CLOSE)
  7546. + tcp_done(sk);
  7547. +
  7548. + if (!sock_is_dead)
  7549. + mptcp_sub_close(sk, 0);
  7550. +}
  7551. +EXPORT_SYMBOL(mptcp_sub_force_close);
  7552. +
  7553. +/* Update the mpcb send window, based on the contributions
  7554. + * of each subflow
  7555. + */
  7556. +void mptcp_update_sndbuf(struct mptcp_cb *mpcb)
  7557. +{
  7558. + struct sock *meta_sk = mpcb->meta_sk, *sk;
  7559. + int new_sndbuf = 0, old_sndbuf = meta_sk->sk_sndbuf;
  7560. + mptcp_for_each_sk(mpcb, sk) {
  7561. + if (!mptcp_sk_can_send(sk))
  7562. + continue;
  7563. +
  7564. + new_sndbuf += sk->sk_sndbuf;
  7565. +
  7566. + if (new_sndbuf > sysctl_tcp_wmem[2] || new_sndbuf < 0) {
  7567. + new_sndbuf = sysctl_tcp_wmem[2];
  7568. + break;
  7569. + }
  7570. + }
  7571. + meta_sk->sk_sndbuf = max(min(new_sndbuf, sysctl_tcp_wmem[2]), meta_sk->sk_sndbuf);
  7572. +
  7573. + /* The subflow's call to sk_write_space in tcp_new_space ends up in
  7574. + * mptcp_write_space.
  7575. + * It has nothing to do with waking up the application.
  7576. + * So, we do it here.
  7577. + */
  7578. + if (old_sndbuf != meta_sk->sk_sndbuf)
  7579. + meta_sk->sk_write_space(meta_sk);
  7580. +}
  7581. +
  7582. +void mptcp_close(struct sock *meta_sk, long timeout)
  7583. +{
  7584. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  7585. + struct sock *sk_it, *tmpsk;
  7586. + struct mptcp_cb *mpcb = meta_tp->mpcb;
  7587. + struct sk_buff *skb;
  7588. + int data_was_unread = 0;
  7589. + int state;
  7590. +
  7591. + mptcp_debug("%s: Close of meta_sk with tok %#x\n",
  7592. + __func__, mpcb->mptcp_loc_token);
  7593. +
  7594. + mutex_lock(&mpcb->mpcb_mutex);
  7595. + lock_sock(meta_sk);
  7596. +
  7597. + if (meta_tp->inside_tk_table) {
  7598. + /* Detach the mpcb from the token hashtable */
  7599. + mptcp_hash_remove_bh(meta_tp);
  7600. + reqsk_queue_destroy(&inet_csk(meta_sk)->icsk_accept_queue);
  7601. + }
  7602. +
  7603. + meta_sk->sk_shutdown = SHUTDOWN_MASK;
  7604. + /* We need to flush the recv. buffs. We do this only on the
  7605. + * descriptor close, not protocol-sourced closes, because the
  7606. + * reader process may not have drained the data yet!
  7607. + */
  7608. + while ((skb = __skb_dequeue(&meta_sk->sk_receive_queue)) != NULL) {
  7609. + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
  7610. + tcp_hdr(skb)->fin;
  7611. + data_was_unread += len;
  7612. + __kfree_skb(skb);
  7613. + }
  7614. +
  7615. + sk_mem_reclaim(meta_sk);
  7616. +
  7617. + /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
  7618. + if (meta_sk->sk_state == TCP_CLOSE) {
  7619. + mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
  7620. + if (tcp_sk(sk_it)->send_mp_fclose)
  7621. + continue;
  7622. + mptcp_sub_close(sk_it, 0);
  7623. + }
  7624. + goto adjudge_to_death;
  7625. + }
  7626. +
  7627. + if (data_was_unread) {
  7628. + /* Unread data was tossed, zap the connection. */
  7629. + NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONCLOSE);
  7630. + tcp_set_state(meta_sk, TCP_CLOSE);
  7631. + tcp_send_active_reset(meta_sk, meta_sk->sk_allocation);
  7632. + } else if (sock_flag(meta_sk, SOCK_LINGER) && !meta_sk->sk_lingertime) {
  7633. + /* Check zero linger _after_ checking for unread data. */
  7634. + meta_sk->sk_prot->disconnect(meta_sk, 0);
  7635. + NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
  7636. + } else if (tcp_close_state(meta_sk)) {
  7637. + mptcp_send_fin(meta_sk);
  7638. + } else if (meta_tp->snd_una == meta_tp->write_seq) {
  7639. + /* The DATA_FIN has been sent and acknowledged
  7640. + * (e.g., by sk_shutdown). Close all the other subflows
  7641. + */
  7642. + mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
  7643. + unsigned long delay = 0;
  7644. + /* If we are the passive closer, don't trigger
  7645. + * subflow-fin until the subflow has been finned
  7646. + * by the peer. - thus we add a delay
  7647. + */
  7648. + if (mpcb->passive_close &&
  7649. + sk_it->sk_state == TCP_ESTABLISHED)
  7650. + delay = inet_csk(sk_it)->icsk_rto << 3;
  7651. +
  7652. + mptcp_sub_close(sk_it, delay);
  7653. + }
  7654. + }
  7655. +
  7656. + sk_stream_wait_close(meta_sk, timeout);
  7657. +
  7658. +adjudge_to_death:
  7659. + state = meta_sk->sk_state;
  7660. + sock_hold(meta_sk);
  7661. + sock_orphan(meta_sk);
  7662. +
  7663. + /* socket will be freed after mptcp_close - we have to prevent
  7664. + * access from the subflows.
  7665. + */
  7666. + mptcp_for_each_sk(mpcb, sk_it) {
  7667. + /* Similar to sock_orphan, but we don't set it DEAD, because
  7668. + * the callbacks are still set and must be called.
  7669. + */
  7670. + write_lock_bh(&sk_it->sk_callback_lock);
  7671. + sk_set_socket(sk_it, NULL);
  7672. + sk_it->sk_wq = NULL;
  7673. + write_unlock_bh(&sk_it->sk_callback_lock);
  7674. + }
  7675. +
  7676. + /* It is the last release_sock in its life. It will remove backlog. */
  7677. + release_sock(meta_sk);
  7678. +
  7679. + /* Now socket is owned by kernel and we acquire BH lock
  7680. + * to finish close. No need to check for user refs.
  7681. + */
  7682. + local_bh_disable();
  7683. + bh_lock_sock(meta_sk);
  7684. + WARN_ON(sock_owned_by_user(meta_sk));
  7685. +
  7686. + percpu_counter_inc(meta_sk->sk_prot->orphan_count);
  7687. +
  7688. + /* Have we already been destroyed by a softirq or backlog? */
  7689. + if (state != TCP_CLOSE && meta_sk->sk_state == TCP_CLOSE)
  7690. + goto out;
  7691. +
  7692. + /* This is a (useful) BSD violating of the RFC. There is a
  7693. + * problem with TCP as specified in that the other end could
  7694. + * keep a socket open forever with no application left this end.
  7695. + * We use a 3 minute timeout (about the same as BSD) then kill
  7696. + * our end. If they send after that then tough - BUT: long enough
  7697. + * that we won't make the old 4*rto = almost no time - whoops
  7698. + * reset mistake.
  7699. + *
  7700. + * Nope, it was not mistake. It is really desired behaviour
  7701. + * f.e. on http servers, when such sockets are useless, but
  7702. + * consume significant resources. Let's do it with special
  7703. + * linger2 option. --ANK
  7704. + */
  7705. +
  7706. + if (meta_sk->sk_state == TCP_FIN_WAIT2) {
  7707. + if (meta_tp->linger2 < 0) {
  7708. + tcp_set_state(meta_sk, TCP_CLOSE);
  7709. + tcp_send_active_reset(meta_sk, GFP_ATOMIC);
  7710. + NET_INC_STATS_BH(sock_net(meta_sk),
  7711. + LINUX_MIB_TCPABORTONLINGER);
  7712. + } else {
  7713. + const int tmo = tcp_fin_time(meta_sk);
  7714. +
  7715. + if (tmo > TCP_TIMEWAIT_LEN) {
  7716. + inet_csk_reset_keepalive_timer(meta_sk,
  7717. + tmo - TCP_TIMEWAIT_LEN);
  7718. + } else {
  7719. + tcp_time_wait(meta_sk, TCP_FIN_WAIT2, tmo);
  7720. + goto out;
  7721. + }
  7722. + }
  7723. + }
  7724. + if (meta_sk->sk_state != TCP_CLOSE) {
  7725. + sk_mem_reclaim(meta_sk);
  7726. + if (tcp_too_many_orphans(meta_sk, 0)) {
  7727. + if (net_ratelimit())
  7728. + pr_info("MPTCP: too many of orphaned sockets\n");
  7729. + tcp_set_state(meta_sk, TCP_CLOSE);
  7730. + tcp_send_active_reset(meta_sk, GFP_ATOMIC);
  7731. + NET_INC_STATS_BH(sock_net(meta_sk),
  7732. + LINUX_MIB_TCPABORTONMEMORY);
  7733. + }
  7734. + }
  7735. +
  7736. +
  7737. + if (meta_sk->sk_state == TCP_CLOSE)
  7738. + inet_csk_destroy_sock(meta_sk);
  7739. + /* Otherwise, socket is reprieved until protocol close. */
  7740. +
  7741. +out:
  7742. + bh_unlock_sock(meta_sk);
  7743. + local_bh_enable();
  7744. + mutex_unlock(&mpcb->mpcb_mutex);
  7745. + sock_put(meta_sk); /* Taken by sock_hold */
  7746. +}
  7747. +
  7748. +void mptcp_disconnect(struct sock *sk)
  7749. +{
  7750. + struct sock *subsk, *tmpsk;
  7751. + struct tcp_sock *tp = tcp_sk(sk);
  7752. +
  7753. + __skb_queue_purge(&tp->mpcb->reinject_queue);
  7754. +
  7755. + if (tp->inside_tk_table) {
  7756. + mptcp_hash_remove_bh(tp);
  7757. + reqsk_queue_destroy(&inet_csk(tp->meta_sk)->icsk_accept_queue);
  7758. + }
  7759. +
  7760. + local_bh_disable();
  7761. + mptcp_for_each_sk_safe(tp->mpcb, subsk, tmpsk) {
  7762. + /* The socket will get removed from the subsocket-list
  7763. + * and made non-mptcp by setting mpc to 0.
  7764. + *
  7765. + * This is necessary, because tcp_disconnect assumes
  7766. + * that the connection is completly dead afterwards.
  7767. + * Thus we need to do a mptcp_del_sock. Due to this call
  7768. + * we have to make it non-mptcp.
  7769. + *
  7770. + * We have to lock the socket, because we set mpc to 0.
  7771. + * An incoming packet would take the subsocket's lock
  7772. + * and go on into the receive-path.
  7773. + * This would be a race.
  7774. + */
  7775. +
  7776. + bh_lock_sock(subsk);
  7777. + mptcp_del_sock(subsk);
  7778. + reset_mpc(tcp_sk(subsk));
  7779. + mptcp_sub_force_close(subsk);
  7780. + bh_unlock_sock(subsk);
  7781. + }
  7782. + local_bh_enable();
  7783. +
  7784. + tp->was_meta_sk = 1;
  7785. + reset_mpc(tp);
  7786. +}
  7787. +
  7788. +
  7789. +/* Returns 1 if we should enable MPTCP for that socket. */
  7790. +int mptcp_doit(struct sock *sk)
  7791. +{
  7792. + /* Do not allow MPTCP enabling if the MPTCP initialization failed */
  7793. + if (mptcp_init_failed)
  7794. + return 0;
  7795. +
  7796. + if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled)
  7797. + return 0;
  7798. +
  7799. + /* Socket may already be established (e.g., called from tcp_recvmsg) */
  7800. + if (tcp_sk(sk)->mpc || tcp_sk(sk)->request_mptcp)
  7801. + return 1;
  7802. +
  7803. + /* Don't do mptcp over loopback */
  7804. + if (sk->sk_family == AF_INET &&
  7805. + (ipv4_is_loopback(inet_sk(sk)->inet_daddr) ||
  7806. + ipv4_is_loopback(inet_sk(sk)->inet_saddr)))
  7807. + return 0;
  7808. +#if IS_ENABLED(CONFIG_IPV6)
  7809. + if (sk->sk_family == AF_INET6 &&
  7810. + (ipv6_addr_loopback(&sk->sk_v6_daddr) ||
  7811. + ipv6_addr_loopback(&inet6_sk(sk)->saddr)))
  7812. + return 0;
  7813. +#endif
  7814. + if (mptcp_v6_is_v4_mapped(sk) &&
  7815. + ipv4_is_loopback(inet_sk(sk)->inet_saddr))
  7816. + return 0;
  7817. +
  7818. +#ifdef CONFIG_TCP_MD5SIG
  7819. + /* If TCP_MD5SIG is enabled, do not do MPTCP - there is no Option-Space */
  7820. + if (tcp_sk(sk)->af_specific->md5_lookup(sk, sk))
  7821. + return 0;
  7822. +#endif
  7823. +
  7824. + return 1;
  7825. +}
  7826. +
  7827. +int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window)
  7828. +{
  7829. + struct tcp_sock *master_tp;
  7830. + struct sock *master_sk;
  7831. +
  7832. + if (mptcp_alloc_mpcb(meta_sk, remote_key, window))
  7833. + goto err_alloc_mpcb;
  7834. +
  7835. + master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
  7836. + master_tp = tcp_sk(master_sk);
  7837. +
  7838. + if (mptcp_add_sock(meta_sk, master_sk, 0, 0, GFP_ATOMIC))
  7839. + goto err_add_sock;
  7840. +
  7841. + if (__inet_inherit_port(meta_sk, master_sk) < 0)
  7842. + goto err_add_sock;
  7843. +
  7844. + meta_sk->sk_prot->unhash(meta_sk);
  7845. +
  7846. + if (master_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(master_sk))
  7847. + __inet_hash_nolisten(master_sk, NULL);
  7848. +#if IS_ENABLED(CONFIG_IPV6)
  7849. + else
  7850. + __inet6_hash(master_sk, NULL);
  7851. +#endif
  7852. +
  7853. + master_tp->mptcp->init_rcv_wnd = master_tp->rcv_wnd;
  7854. +
  7855. + return 0;
  7856. +
  7857. +err_add_sock:
  7858. + mptcp_fallback_meta_sk(meta_sk);
  7859. +
  7860. + inet_csk_prepare_forced_close(master_sk);
  7861. + tcp_done(master_sk);
  7862. + inet_csk_prepare_forced_close(meta_sk);
  7863. + tcp_done(meta_sk);
  7864. +
  7865. +err_alloc_mpcb:
  7866. + return -ENOBUFS;
  7867. +}
  7868. +
  7869. +int mptcp_check_req_master(struct sock *sk, struct sock *child,
  7870. + struct request_sock *req,
  7871. + struct request_sock **prev,
  7872. + struct mptcp_options_received *mopt)
  7873. +{
  7874. + struct tcp_sock *child_tp = tcp_sk(child);
  7875. + struct sock *meta_sk = child;
  7876. + struct mptcp_cb *mpcb;
  7877. + struct mptcp_request_sock *mtreq;
  7878. +
  7879. + if (!tcp_rsk(req)->saw_mpc)
  7880. + return 1;
  7881. +
  7882. + /* Just set this values to pass them to mptcp_alloc_mpcb */
  7883. + mtreq = mptcp_rsk(req);
  7884. + child_tp->mptcp_loc_key = mtreq->mptcp_loc_key;
  7885. + child_tp->mptcp_loc_token = mtreq->mptcp_loc_token;
  7886. +
  7887. + if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key,
  7888. + child_tp->snd_wnd))
  7889. + return -ENOBUFS;
  7890. +
  7891. + child = tcp_sk(child)->mpcb->master_sk;
  7892. + child_tp = tcp_sk(child);
  7893. + mpcb = child_tp->mpcb;
  7894. +
  7895. + child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
  7896. + child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
  7897. +
  7898. + mpcb->dss_csum = mtreq->dss_csum;
  7899. + mpcb->server_side = 1;
  7900. +
  7901. + /* Will be moved to ESTABLISHED by tcp_rcv_state_process() */
  7902. + mptcp_update_metasocket(child, meta_sk);
  7903. +
  7904. + /* Needs to be done here additionally, because when accepting a
  7905. + * new connection we pass by __reqsk_free and not reqsk_free.
  7906. + */
  7907. + mptcp_reqsk_remove_tk(req);
  7908. +
  7909. + /* Hold when creating the meta-sk in tcp_vX_syn_recv_sock. */
  7910. + sock_put(meta_sk);
  7911. +
  7912. + inet_csk_reqsk_queue_unlink(sk, req, prev);
  7913. + inet_csk_reqsk_queue_removed(sk, req);
  7914. + inet_csk_reqsk_queue_add(sk, req, meta_sk);
  7915. +
  7916. + return 0;
  7917. +}
  7918. +
  7919. +struct sock *mptcp_check_req_child(struct sock *meta_sk, struct sock *child,
  7920. + struct request_sock *req,
  7921. + struct request_sock **prev,
  7922. + struct mptcp_options_received *mopt)
  7923. +{ /* Validates a join request; returns child once attached to meta_sk, or meta_sk after tearing child down. */
  7924. + struct tcp_sock *child_tp = tcp_sk(child);
  7925. + struct mptcp_request_sock *mtreq = mptcp_rsk(req);
  7926. + struct mptcp_cb *mpcb = mtreq->mpcb;
  7927. + u8 hash_mac_check[20]; /* HMAC computed locally, compared with the received one */
  7928. +
  7929. + child_tp->inside_tk_table = 0;
  7930. +
  7931. + if (!mopt->join_ack) /* parsed options carry no join_ack flag: reject */
  7932. + goto teardown;
  7933. +
  7934. + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
  7935. + (u8 *)&mpcb->mptcp_loc_key,
  7936. + (u8 *)&mtreq->mptcp_rem_nonce,
  7937. + (u8 *)&mtreq->mptcp_loc_nonce,
  7938. + (u32 *)hash_mac_check);
  7939. +
  7940. + if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20)) /* NOTE(review): memcmp is not a constant-time compare */
  7941. + goto teardown;
  7942. +
  7943. + /* Point it to the same struct socket and wq as the meta_sk */
  7944. + sk_set_socket(child, meta_sk->sk_socket);
  7945. + child->sk_wq = meta_sk->sk_wq;
  7946. +
  7947. + if (mptcp_add_sock(meta_sk, child, mtreq->loc_id, mtreq->rem_id, GFP_ATOMIC)) {
  7948. + reset_mpc(child_tp); /* Has been inherited, but now
  7949. + * child_tp->mptcp is NULL
  7950. + */
  7951. + /* TODO when we support acking the third ack for new subflows,
  7952. + * we should silently discard this third ack, by returning NULL.
  7953. + *
  7954. + * Maybe, at the retransmission we will have enough memory to
  7955. + * fully add the socket to the meta-sk.
  7956. + */
  7957. + goto teardown;
  7958. + }
  7959. +
  7960. + /* The child is a clone of the meta socket, so we must now reset
  7961. + * some of its fields.
  7962. + */
  7963. + child_tp->mptcp->rcv_low_prio = mtreq->low_prio;
  7964. +
  7965. + /* We should allow proper increase of the snd/rcv-buffers. Thus, we
  7966. + * use the original values instead of the bloated up ones from the
  7967. + * clone.
  7968. + */
  7969. + child->sk_sndbuf = mpcb->orig_sk_sndbuf;
  7970. + child->sk_rcvbuf = mpcb->orig_sk_rcvbuf;
  7971. +
  7972. + child_tp->mptcp->slave_sk = 1;
  7973. + child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
  7974. + child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
  7975. + child_tp->mptcp->init_rcv_wnd = req->rcv_wnd;
  7976. +
  7977. + child_tp->tsq_flags = 0;
  7978. +
  7979. + /* Subflows do not use the accept queue, as they
  7980. + * are attached immediately to the mpcb.
  7981. + */
  7982. + inet_csk_reqsk_queue_drop(meta_sk, req, prev);
  7983. + return child;
  7984. +
  7985. +teardown:
  7986. + /* Drop this request - validation or sock creation failed. */
  7987. + inet_csk_reqsk_queue_drop(meta_sk, req, prev);
  7988. + inet_csk_prepare_forced_close(child);
  7989. + tcp_done(child);
  7990. + return meta_sk; /* callers get the meta-sk back instead of the child */
  7991. +}
  7992. +
  7993. +int mptcp_time_wait(struct sock *sk, struct tcp_timewait_sock *tw)
  7994. +{ /* Allocates an mptcp_tw for tw and links it into the mpcb's tw_list; returns 0 or -ENOBUFS. */
  7995. + struct mptcp_tw *mptw;
  7996. + struct tcp_sock *tp = tcp_sk(sk);
  7997. + struct mptcp_cb *mpcb = tp->mpcb;
  7998. +
  7999. + /* Alloc MPTCP-tw-sock */
  8000. + mptw = kmem_cache_alloc(mptcp_tw_cache, GFP_ATOMIC);
  8001. + if (!mptw)
  8002. + return -ENOBUFS;
  8003. +
  8004. + atomic_inc(&mpcb->mpcb_refcnt); /* presumably balanced by mptcp_twsk_destructor() - confirm */
  8005. +
  8006. + tw->mptcp_tw = mptw;
  8007. + mptw->loc_key = mpcb->mptcp_loc_key;
  8008. + mptw->meta_tw = mpcb->in_time_wait;
  8009. + if (mptw->meta_tw) { /* meta already in time-wait: snapshot its rcv_nxt now */
  8010. + mptw->rcv_nxt = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp));
  8011. + if (mpcb->mptw_state != TCP_TIME_WAIT) /* same +1 adjustment as in mptcp_update_tw_socks() */
  8012. + mptw->rcv_nxt++;
  8013. + }
  8014. + rcu_assign_pointer(mptw->mpcb, mpcb);
  8015. +
  8016. + spin_lock(&mpcb->tw_lock);
  8017. + list_add_rcu(&mptw->list, &tp->mpcb->tw_list); /* tp->mpcb is the mpcb read above */
  8018. + mptw->in_list = 1;
  8019. + spin_unlock(&mpcb->tw_lock);
  8020. +
  8021. + return 0;
  8022. +}
  8023. +
  8024. +void mptcp_twsk_destructor(struct tcp_timewait_sock *tw)
  8025. +{ /* Unlinks tw's mptcp_tw from its mpcb when that is still referenced, then frees the mptcp_tw. */
  8026. + struct mptcp_cb *mpcb;
  8027. +
  8028. + rcu_read_lock();
  8029. + mpcb = rcu_dereference(tw->mptcp_tw->mpcb);
  8030. +
  8031. + /* If we are still holding a ref to the mpcb, we have to remove
  8032. + * ourselves from the list and drop the ref properly.
  8033. + */
  8034. + if (mpcb && atomic_inc_not_zero(&mpcb->mpcb_refcnt)) { /* skipped when the refcount is already zero */
  8035. + spin_lock(&mpcb->tw_lock);
  8036. + if (tw->mptcp_tw->in_list) {
  8037. + list_del_rcu(&tw->mptcp_tw->list);
  8038. + tw->mptcp_tw->in_list = 0;
  8039. + }
  8040. + spin_unlock(&mpcb->tw_lock);
  8041. +
  8042. + /* Twice: once for the inc_not_zero above, once presumably for the ref taken in mptcp_time_wait() */
  8043. + mptcp_mpcb_put(mpcb);
  8044. + mptcp_mpcb_put(mpcb);
  8045. + }
  8046. +
  8047. + rcu_read_unlock();
  8048. +
  8049. + kmem_cache_free(mptcp_tw_cache, tw->mptcp_tw);
  8050. +}
  8051. +
  8052. +/* Marks the mpcb of tp as being in time-wait (recording state) and updates
  8053. + * meta_tw/rcv_nxt of every entry on its tw_list, so they can ack a data-fin.
  8054. + */
  8055. +void mptcp_update_tw_socks(const struct tcp_sock *tp, int state)
  8056. +{
  8057. + struct mptcp_tw *mptw;
  8058. +
  8059. + /* Used for sockets that go into tw after the meta
  8060. + * (see mptcp_time_wait())
  8061. + */
  8062. + tp->mpcb->in_time_wait = 1;
  8063. + tp->mpcb->mptw_state = state;
  8064. +
  8065. + /* Update the time-wait-sock's information */
  8066. + rcu_read_lock_bh();
  8067. + list_for_each_entry_rcu(mptw, &tp->mpcb->tw_list, list) {
  8068. + mptw->meta_tw = 1;
  8069. + mptw->rcv_nxt = mptcp_get_rcv_nxt_64(tp);
  8070. +
  8071. + /* We want to ack a DATA_FIN, but are yet in FIN_WAIT_2 -
  8072. + * pretend as if the DATA_FIN has already reached us, that way
  8073. + * the checks in tcp_timewait_state_process will be good as the
  8074. + * DATA_FIN comes in.
  8075. + */
  8076. + if (state != TCP_TIME_WAIT)
  8077. + mptw->rcv_nxt++;
  8078. + }
  8079. + rcu_read_unlock_bh();
  8080. +}
  8081. +
  8082. +void mptcp_tsq_flags(struct sock *sk)
  8083. +{ /* Queues subflow sk on its mpcb's callback_list and sets MPTCP_SUB_DEFERRED on the meta-sk. */
  8084. + struct tcp_sock *tp = tcp_sk(sk);
  8085. + struct sock *meta_sk = mptcp_meta_sk(sk);
  8086. +
  8087. + /* It will be handled as a regular deferred-call */
  8088. + if (is_meta_sk(sk))
  8089. + return;
  8090. +
  8091. + if (list_empty(&tp->mptcp->cb_list)) { /* not queued yet */
  8092. + list_add(&tp->mptcp->cb_list, &tp->mpcb->callback_list);
  8093. + /* We need to hold it here, as the sock_hold is not assured
  8094. + * by the release_sock as it is done in regular TCP.
  8095. + *
  8096. + * The subsocket may get inet_csk_destroy'd while it is inside
  8097. + * the callback_list.
  8098. + */
  8099. + sock_hold(sk); /* dropped by the sock_put() in mptcp_tsq_sub_deferred() */
  8100. + }
  8101. +
  8102. + if (!test_and_set_bit(MPTCP_SUB_DEFERRED, &tcp_sk(meta_sk)->tsq_flags)) /* hold the meta-sk only when the bit was newly set */
  8103. + sock_hold(meta_sk);
  8104. +}
  8105. +
  8106. +void mptcp_tsq_sub_deferred(struct sock *meta_sk)
  8107. +{ /* Runs release_cb for every subflow on the mpcb's callback_list and drops the references held for them. */
  8108. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  8109. + struct mptcp_tcp_sock *mptcp, *tmp;
  8110. +
  8111. + BUG_ON(!is_meta_sk(meta_sk) && !meta_tp->was_meta_sk);
  8112. +
  8113. + __sock_put(meta_sk); /* presumably pairs with the sock_hold(meta_sk) in mptcp_tsq_flags() - confirm */
  8114. + list_for_each_entry_safe(mptcp, tmp, &meta_tp->mpcb->callback_list, cb_list) {
  8115. + struct tcp_sock *tp = mptcp->tp;
  8116. + struct sock *sk = (struct sock *)tp;
  8117. +
  8118. + list_del_init(&mptcp->cb_list); /* leaves cb_list empty so mptcp_tsq_flags() can queue sk again */
  8119. + sk->sk_prot->release_cb(sk);
  8120. + /* Final sock_put (cf. mptcp_tsq_flags()) */
  8121. + sock_put(sk);
  8122. + }
  8123. +}
  8124. +
  8125. +struct workqueue_struct *mptcp_wq; /* allocated in mptcp_init() */
  8126. +EXPORT_SYMBOL(mptcp_wq);
  8127. +
  8128. +/* Output /proc/net/mptcp: a header line, then one line per entry of tk_hashtable that has mpc set and belongs to the reader's namespace */
  8129. +static int mptcp_pm_seq_show(struct seq_file *seq, void *v)
  8130. +{
  8131. + struct tcp_sock *meta_tp;
  8132. + struct net *net = seq->private;
  8133. + int i, n = 0; /* n numbers the printed lines */
  8134. +
  8135. + seq_printf(seq, " sl loc_tok rem_tok v6 "
  8136. + "local_address "
  8137. + "remote_address "
  8138. + "st ns tx_queue rx_queue inode");
  8139. + seq_putc(seq, '\n');
  8140. +
  8141. + for (i = 0; i < MPTCP_HASH_SIZE; i++) {
  8142. + struct hlist_nulls_node *node;
  8143. + rcu_read_lock_bh(); /* taken and released once per hash bucket */
  8144. + hlist_nulls_for_each_entry_rcu(meta_tp, node,
  8145. + &tk_hashtable[i], tk_table) {
  8146. + struct mptcp_cb *mpcb = meta_tp->mpcb;
  8147. + struct sock *meta_sk = (struct sock *)meta_tp;
  8148. + struct inet_sock *isk = inet_sk(meta_sk);
  8149. +
  8150. + if (!meta_tp->mpc || !net_eq(net, sock_net(meta_sk)))
  8151. + continue;
  8152. +
  8153. + seq_printf(seq, "%4d: %04X %04X ", n++,
  8154. + mpcb->mptcp_loc_token,
  8155. + mpcb->mptcp_rem_token);
  8156. + if (meta_sk->sk_family == AF_INET ||
  8157. + mptcp_v6_is_v4_mapped(meta_sk)) { /* v4 and v4-mapped sockets use the short address form */
  8158. + seq_printf(seq, " 0 %08X:%04X %08X:%04X ",
  8159. + isk->inet_saddr,
  8160. + ntohs(isk->inet_sport),
  8161. + isk->inet_daddr,
  8162. + ntohs(isk->inet_dport));
  8163. +#if IS_ENABLED(CONFIG_IPV6)
  8164. + } else if (meta_sk->sk_family == AF_INET6) {
  8165. + struct in6_addr *src = &isk->pinet6->saddr;
  8166. + struct in6_addr *dst = &meta_sk->sk_v6_daddr;
  8167. + seq_printf(seq, " 1 %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X",
  8168. + src->s6_addr32[0], src->s6_addr32[1],
  8169. + src->s6_addr32[2], src->s6_addr32[3],
  8170. + ntohs(isk->inet_sport),
  8171. + dst->s6_addr32[0], dst->s6_addr32[1],
  8172. + dst->s6_addr32[2], dst->s6_addr32[3],
  8173. + ntohs(isk->inet_dport));
  8174. +#endif
  8175. + }
  8176. + seq_printf(seq, " %02X %02X %08X:%08X %lu",
  8177. + meta_sk->sk_state, mpcb->cnt_subflows,
  8178. + meta_tp->write_seq - meta_tp->snd_una, /* tx_queue column */
  8179. + max_t(int, meta_tp->rcv_nxt -
  8180. + meta_tp->copied_seq, 0), /* rx_queue column, clamped at 0 */
  8181. + sock_i_ino(meta_sk));
  8182. + seq_putc(seq, '\n');
  8183. + }
  8184. + rcu_read_unlock_bh();
  8185. + }
  8186. +
  8187. + return 0;
  8188. +}
  8189. +
  8190. +static int mptcp_pm_seq_open(struct inode *inode, struct file *file)
  8191. +{ /* open handler: binds mptcp_pm_seq_show() via single_open_net() */
  8192. + return single_open_net(inode, file, mptcp_pm_seq_show);
  8193. +}
  8194. +
  8195. +static const struct file_operations mptcp_pm_seq_fops = { /* file ops of the "mptcp" proc entry created in mptcp_pm_init_net() */
  8196. + .owner = THIS_MODULE,
  8197. + .open = mptcp_pm_seq_open,
  8198. + .read = seq_read,
  8199. + .llseek = seq_lseek,
  8200. + .release = single_release_net, /* counterpart of single_open_net() used in the open handler */
  8201. +};
  8202. +
  8203. +static int mptcp_pm_init_net(struct net *net)
  8204. +{
  8205. + /* Create the "mptcp" proc entry of this namespace; failure maps to -ENOMEM. */
  8206. + if (proc_create("mptcp", S_IRUGO, net->proc_net, &mptcp_pm_seq_fops))
  8207. + return 0;
  8208. + return -ENOMEM;
  8209. +}
  8210. +
  8211. +static void mptcp_pm_exit_net(struct net *net)
  8212. +{ /* removes the entry created by mptcp_pm_init_net() */
  8213. + remove_proc_entry("mptcp", net->proc_net);
  8214. +}
  8215. +
  8216. +static struct pernet_operations mptcp_pm_proc_ops = { /* registered from mptcp_init() */
  8217. + .init = mptcp_pm_init_net,
  8218. + .exit = mptcp_pm_exit_net,
  8219. +};
  8220. +
  8221. +/* General initialization of mptcp: leaves mptcp_init_failed false on success, true after unwinding a failed step */
  8222. +void __init mptcp_init(void)
  8223. +{
  8224. + int i;
  8225. + struct ctl_table_header *mptcp_sysctl;
  8226. +
  8227. + mptcp_sock_cache = kmem_cache_create("mptcp_sock",
  8228. + sizeof(struct mptcp_tcp_sock),
  8229. + 0, SLAB_HWCACHE_ALIGN,
  8230. + NULL);
  8231. + if (!mptcp_sock_cache)
  8232. + goto mptcp_sock_cache_failed;
  8233. +
  8234. + mptcp_cb_cache = kmem_cache_create("mptcp_cb", sizeof(struct mptcp_cb),
  8235. + 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
  8236. + NULL);
  8237. + if (!mptcp_cb_cache)
  8238. + goto mptcp_cb_cache_failed;
  8239. +
  8240. + mptcp_tw_cache = kmem_cache_create("mptcp_tw", sizeof(struct mptcp_tw),
  8241. + 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
  8242. + NULL);
  8243. + if (!mptcp_tw_cache)
  8244. + goto mptcp_tw_cache_failed;
  8245. +
  8246. + get_random_bytes(mptcp_secret, sizeof(mptcp_secret));
  8247. +
  8248. + mptcp_wq = alloc_workqueue("mptcp_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8);
  8249. + if (!mptcp_wq)
  8250. + goto alloc_workqueue_failed;
  8251. +
  8252. + for (i = 0; i < MPTCP_HASH_SIZE; i++) { /* all three hash tables share MPTCP_HASH_SIZE buckets */
  8253. + INIT_HLIST_NULLS_HEAD(&tk_hashtable[i], i);
  8254. + INIT_LIST_HEAD(&mptcp_reqsk_htb[i]);
  8255. + INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_tk_htb[i], i);
  8256. + }
  8257. +
  8258. + spin_lock_init(&mptcp_reqsk_hlock);
  8259. + spin_lock_init(&mptcp_tk_hashlock);
  8260. +
  8261. + if (register_pernet_subsys(&mptcp_pm_proc_ops))
  8262. + goto pernet_failed;
  8263. +
  8264. +#if IS_ENABLED(CONFIG_IPV6)
  8265. + if (mptcp_pm_v6_init())
  8266. + goto mptcp_pm_v6_failed;
  8267. +#endif
  8268. + if (mptcp_pm_v4_init())
  8269. + goto mptcp_pm_v4_failed;
  8270. +
  8271. + mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", mptcp_table);
  8272. + if (!mptcp_sysctl)
  8273. + goto register_sysctl_failed;
  8274. +
  8275. + if (mptcp_register_path_manager(&mptcp_pm_default))
  8276. + goto register_pm_failed;
  8277. +
  8278. + pr_info("MPTCP: Stable release v0.89.0-rc\n"); /* newline terminates the log record */
  8279. +
  8280. + mptcp_init_failed = false;
  8281. +
  8282. + return;
  8283. +
  8284. +register_pm_failed: /* the labels below undo the steps above in reverse order */
  8285. + unregister_net_sysctl_table(mptcp_sysctl);
  8286. +register_sysctl_failed:
  8287. + mptcp_pm_v4_undo();
  8288. +mptcp_pm_v4_failed:
  8289. +#if IS_ENABLED(CONFIG_IPV6)
  8290. + mptcp_pm_v6_undo();
  8291. +mptcp_pm_v6_failed:
  8292. +#endif
  8293. + unregister_pernet_subsys(&mptcp_pm_proc_ops);
  8294. +pernet_failed:
  8295. + destroy_workqueue(mptcp_wq);
  8296. +alloc_workqueue_failed:
  8297. + kmem_cache_destroy(mptcp_tw_cache);
  8298. +mptcp_tw_cache_failed:
  8299. + kmem_cache_destroy(mptcp_cb_cache);
  8300. +mptcp_cb_cache_failed:
  8301. + kmem_cache_destroy(mptcp_sock_cache);
  8302. +mptcp_sock_cache_failed:
  8303. + mptcp_init_failed = true;
  8304. +}
  8305. diff --git a/net/mptcp/mptcp_fullmesh.c b/net/mptcp/mptcp_fullmesh.c
  8306. new file mode 100644
  8307. index 0000000..49bddf3
  8308. --- /dev/null
  8309. +++ b/net/mptcp/mptcp_fullmesh.c
  8310. @@ -0,0 +1,1313 @@
  8311. +#include <linux/module.h>
  8312. +
  8313. +#include <net/mptcp.h>
  8314. +#include <net/mptcp_v4.h>
  8315. +
  8316. +#if IS_ENABLED(CONFIG_IPV6)
  8317. +#include <net/mptcp_v6.h>
  8318. +#include <net/addrconf.h>
  8319. +#endif
  8320. +
  8321. +enum { /* values stored in mptcp_addr_event.code */
  8322. + MPTCP_EVENT_ADD = 1,
  8323. + MPTCP_EVENT_DEL,
  8324. + MPTCP_EVENT_MOD,
  8325. +};
  8326. +
  8327. +struct mptcp_loc_addr { /* per-namespace list of local addresses, published through mptcp_fm_ns.local */
  8328. + struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR];
  8329. + u8 loc4_bits; /* bitmap of the locaddr4[] slots in use */
  8330. + u8 next_v4_index; /* slot after the last added one; hint for __mptcp_find_free_index() */
  8331. +
  8332. + struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR];
  8333. + u8 loc6_bits; /* bitmap of the locaddr6[] slots in use */
  8334. + u8 next_v6_index; /* slot after the last added one; hint for __mptcp_find_free_index() */
  8335. +};
  8336. +
  8337. +struct mptcp_addr_event { /* one queued address change, linked on mptcp_fm_ns.events */
  8338. + struct list_head list;
  8339. + unsigned short family; /* compared against AF_INET / AF_INET6 */
  8340. + u8 code:7, /* MPTCP_EVENT_ADD, _DEL or _MOD */
  8341. + low_prio:1; /* priority flag copied into the local-address slot */
  8342. + union inet_addr addr; /* .in or .in6 is read, depending on family */
  8343. +};
  8344. +
  8345. +struct fullmesh_priv {
  8346. + /* Worker struct for subflow establishment */
  8347. + struct work_struct subflow_work;
  8348. + /* Delayed worker, when the routing-tables are not yet ready. */
  8349. + struct delayed_work subflow_retry_work;
  8350. +
  8351. + struct mptcp_cb *mpcb; /* the workers reach the meta-sk through mpcb->meta_sk */
  8352. +
  8353. + u16 remove_addrs; /* Addresses to remove (bit per address-id) */
  8354. + u8 announced_addrs_v4; /* IPv4 Addresses we did announce */
  8355. + u8 announced_addrs_v6; /* IPv6 Addresses we did announce */
  8356. +
  8357. + u8 add_addr; /* Are we sending an add_addr? */
  8358. +};
  8359. +
  8360. +struct mptcp_fm_ns { /* per-namespace state of the full-mesh path manager, see fm_get_ns() */
  8361. + struct mptcp_loc_addr __rcu *local;
  8362. + spinlock_t local_lock; /* Protecting the above pointer; also held while dequeuing from events */
  8363. + struct list_head events; /* queued mptcp_addr_event entries, drained by address_worker */
  8364. + struct delayed_work address_worker;
  8365. +
  8366. + struct net *net;
  8367. +};
  8368. +
  8369. +static struct mptcp_pm_ops full_mesh __read_mostly;
  8370. +
  8371. +static struct mptcp_fm_ns *fm_get_ns(struct net *net)
  8372. +{ /* returns the full-mesh slot of net's path_managers[] array, cast to mptcp_fm_ns */
  8373. + return (struct mptcp_fm_ns *)net->mptcp.path_managers[MPTCP_PM_FULLMESH];
  8374. +}
  8375. +
  8376. +static void full_mesh_create_subflows(struct sock *meta_sk);
  8377. +
  8378. +static void retry_subflow_worker(struct work_struct *work)
  8379. +{ /* Delayed work: retries the subflows flagged in retry_bitfield; always drops the meta-sk reference taken when it was queued. */
  8380. + struct delayed_work *delayed_work = container_of(work,
  8381. + struct delayed_work,
  8382. + work);
  8383. + struct fullmesh_priv *pm_priv = container_of(delayed_work,
  8384. + struct fullmesh_priv,
  8385. + subflow_retry_work);
  8386. + struct mptcp_cb *mpcb = pm_priv->mpcb;
  8387. + struct sock *meta_sk = mpcb->meta_sk;
  8388. + struct mptcp_loc_addr *mptcp_local;
  8389. + struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
  8390. + int iter = 0, i;
  8391. +
  8392. + /* We need a local (stable) copy of the address-list. Really, it is not
  8393. + * such a big deal, if the address-list is not 100% up-to-date.
  8394. + */
  8395. + rcu_read_lock_bh();
  8396. + mptcp_local = rcu_dereference_bh(fm_ns->local);
  8397. + mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
  8398. + rcu_read_unlock_bh();
  8399. + if (!mptcp_local) {
  8400. + sock_put(meta_sk); /* no copy: still drop the reference held for this work */
  8401. + return;
  8402. + }
  8403. +next_subflow:
  8404. + if (iter) { /* drop both locks between subflows so others can use the socket */
  8405. + release_sock(meta_sk);
  8406. + mutex_unlock(&mpcb->mpcb_mutex);
  8407. +
  8408. + yield();
  8409. + }
  8410. + mutex_lock(&mpcb->mpcb_mutex);
  8411. + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
  8412. +
  8413. + iter++;
  8414. +
  8415. + if (sock_flag(meta_sk, SOCK_DEAD))
  8416. + goto exit;
  8417. +
  8418. + mptcp_for_each_bit_set(mpcb->rem4_bits, i) {
  8419. + struct mptcp_rem4 *rem = &mpcb->remaddr4[i];
  8420. + /* Do we need to retry establishing a subflow ? */
  8421. + if (rem->retry_bitfield) {
  8422. + int i = mptcp_find_free_index(~rem->retry_bitfield); /* inner i (local-address slot) shadows the remote index */
  8423. +
  8424. + rem->bitfield |= (1 << i);
  8425. + rem->retry_bitfield &= ~(1 << i);
  8426. +
  8427. + mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], rem);
  8428. + goto next_subflow;
  8429. + }
  8430. + }
  8431. +
  8432. +#if IS_ENABLED(CONFIG_IPV6)
  8433. + mptcp_for_each_bit_set(mpcb->rem6_bits, i) {
  8434. + struct mptcp_rem6 *rem = &mpcb->remaddr6[i];
  8435. +
  8436. + /* Do we need to retry establishing a subflow ? */
  8437. + if (rem->retry_bitfield) {
  8438. + int i = mptcp_find_free_index(~rem->retry_bitfield); /* inner i (local-address slot) shadows the remote index */
  8439. +
  8440. + rem->bitfield |= (1 << i);
  8441. + rem->retry_bitfield &= ~(1 << i);
  8442. +
  8443. + mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], rem);
  8444. + goto next_subflow;
  8445. + }
  8446. + }
  8447. +#endif
  8448. +
  8449. +exit:
  8450. + kfree(mptcp_local);
  8451. + release_sock(meta_sk);
  8452. + mutex_unlock(&mpcb->mpcb_mutex);
  8453. + sock_put(meta_sk); /* pairs with the sock_hold() done before queue_delayed_work() */
  8454. +}
  8455. +
  8456. +/**
  8457. + * Create all new subflows, by doing calls to mptcp_initX_subsockets
  8458. + *
  8459. + * This function uses a goto next_subflow, to allow releasing the lock between
  8460. + * new subflows and giving other processes a chance to do some work on the
  8461. + * socket and potentially finishing the communication. Every exit path drops
  8462. + * one reference on the meta-sk. **/
  8463. +static void create_subflow_worker(struct work_struct *work)
  8464. +{
  8465. + struct fullmesh_priv *pm_priv = container_of(work,
  8466. + struct fullmesh_priv,
  8467. + subflow_work);
  8468. + struct mptcp_cb *mpcb = pm_priv->mpcb;
  8469. + struct sock *meta_sk = mpcb->meta_sk;
  8470. + struct mptcp_loc_addr *mptcp_local;
  8471. + struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
  8472. + int iter = 0, retry = 0;
  8473. + int i;
  8474. +
  8475. + /* We need a local (stable) copy of the address-list. Really, it is not
  8476. + * such a big deal, if the address-list is not 100% up-to-date.
  8477. + */
  8478. + rcu_read_lock_bh();
  8479. + mptcp_local = rcu_dereference_bh(fm_ns->local);
  8480. + mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
  8481. + rcu_read_unlock_bh();
  8482. + if (!mptcp_local) {
  8483. + sock_put(meta_sk); /* same reference drop as on the exit path below */
  8484. + return;
  8485. + }
  8486. +next_subflow:
  8487. + if (iter) { /* drop both locks between subflows so others can use the socket */
  8488. + release_sock(meta_sk);
  8489. + mutex_unlock(&mpcb->mpcb_mutex);
  8490. +
  8491. + yield();
  8492. + }
  8493. + mutex_lock(&mpcb->mpcb_mutex);
  8494. + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
  8495. +
  8496. + iter++;
  8497. +
  8498. + if (sock_flag(meta_sk, SOCK_DEAD))
  8499. + goto exit;
  8500. +
  8501. + if (mpcb->master_sk &&
  8502. + !tcp_sk(mpcb->master_sk)->mptcp->fully_established) /* master not fully established yet: create nothing */
  8503. + goto exit;
  8504. +
  8505. + mptcp_for_each_bit_set(mpcb->rem4_bits, i) {
  8506. + struct mptcp_rem4 *rem;
  8507. + u8 remaining_bits;
  8508. +
  8509. + rem = &mpcb->remaddr4[i];
  8510. + remaining_bits = ~(rem->bitfield) & mptcp_local->loc4_bits; /* local slots not yet paired with this remote */
  8511. +
  8512. + /* Are there still combinations to handle? */
  8513. + if (remaining_bits) {
  8514. + int i = mptcp_find_free_index(~remaining_bits); /* inner i (local-address slot) shadows the remote index */
  8515. +
  8516. + rem->bitfield |= (1 << i);
  8517. +
  8518. + /* If a route is not yet available then retry once */
  8519. + if (mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i],
  8520. + rem) == -ENETUNREACH)
  8521. + retry = rem->retry_bitfield |= (1 << i);
  8522. + goto next_subflow;
  8523. + }
  8524. + }
  8525. +
  8526. +#if IS_ENABLED(CONFIG_IPV6)
  8527. + mptcp_for_each_bit_set(mpcb->rem6_bits, i) {
  8528. + struct mptcp_rem6 *rem;
  8529. + u8 remaining_bits;
  8530. +
  8531. + rem = &mpcb->remaddr6[i];
  8532. + remaining_bits = ~(rem->bitfield) & mptcp_local->loc6_bits; /* local slots not yet paired with this remote */
  8533. +
  8534. + /* Are there still combinations to handle? */
  8535. + if (remaining_bits) {
  8536. + int i = mptcp_find_free_index(~remaining_bits); /* inner i (local-address slot) shadows the remote index */
  8537. +
  8538. + rem->bitfield |= (1 << i);
  8539. +
  8540. + /* If a route is not yet available then retry once */
  8541. + if (mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i],
  8542. + rem) == -ENETUNREACH)
  8543. + retry = rem->retry_bitfield |= (1 << i);
  8544. + goto next_subflow;
  8545. + }
  8546. + }
  8547. +#endif
  8548. +
  8549. + if (retry && !delayed_work_pending(&pm_priv->subflow_retry_work)) {
  8550. + sock_hold(meta_sk); /* released by retry_subflow_worker() */
  8551. + queue_delayed_work(mptcp_wq, &pm_priv->subflow_retry_work,
  8552. + msecs_to_jiffies(MPTCP_SUBFLOW_RETRY_DELAY));
  8553. + }
  8554. +
  8555. +exit:
  8556. + kfree(mptcp_local);
  8557. + release_sock(meta_sk);
  8558. + mutex_unlock(&mpcb->mpcb_mutex);
  8559. + sock_put(meta_sk); /* presumably pairs with a sock_hold() by whoever queued subflow_work - confirm */
  8560. +}
  8561. +
  8562. +static void update_remove_addrs(u8 addr_id, struct sock *meta_sk,
  8563. + struct mptcp_loc_addr *mptcp_local)
  8564. +{ /* Records addr_id as to-be-removed, prunes the pairing bitfields to the still-known local slots and triggers an ACK. */
  8565. + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
  8566. + struct fullmesh_priv *fmp = (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
  8567. + struct sock *sk;
  8568. + int i;
  8569. +
  8570. + fmp->remove_addrs |= (1 << addr_id);
  8571. + /* v4 goes from 0 to MPTCP_MAX_ADDR, v6 beyond */
  8572. + if (addr_id < MPTCP_MAX_ADDR) { /* NOTE(review): v4 ids are slot + 1, so the last v4 slot's id equals MPTCP_MAX_ADDR - confirm */
  8573. + fmp->announced_addrs_v4 &= ~(1 << addr_id);
  8574. +
  8575. + mptcp_for_each_bit_set(mpcb->rem4_bits, i) {
  8576. + mpcb->remaddr4[i].bitfield &= mptcp_local->loc4_bits;
  8577. + mpcb->remaddr4[i].retry_bitfield &= mptcp_local->loc4_bits;
  8578. + }
  8579. + } else {
  8580. + fmp->announced_addrs_v6 &= ~(1 << (addr_id - MPTCP_MAX_ADDR));
  8581. +
  8582. + mptcp_for_each_bit_set(mpcb->rem6_bits, i) {
  8583. + mpcb->remaddr6[i].bitfield &= mptcp_local->loc6_bits;
  8584. + mpcb->remaddr6[i].retry_bitfield &= mptcp_local->loc6_bits;
  8585. + }
  8586. + }
  8587. +
  8588. + sk = mptcp_select_ack_sock(meta_sk, 0);
  8589. + if (sk) /* no ACK is sent when no subflow was selected */
  8590. + tcp_send_ack(sk);
  8591. +}
  8592. +
  8593. +static int mptcp_find_address(struct mptcp_loc_addr *mptcp_local,
  8594. + sa_family_t family, union inet_addr *addr)
  8595. +{
  8596. + /* Slot lookup: returns the index of addr among the in-use local
  8597. + * addresses of mptcp_local, or -1 when it is not present.
  8598. + *
  8599. + * AF_INET scans the v4 bitmap; every other family scans the v6
  8600. + * bitmap, but only AF_INET6 entries can actually match.
  8601. + */
  8602. + u8 used_slots = (family == AF_INET) ? mptcp_local->loc4_bits :
  8603. + mptcp_local->loc6_bits;
  8604. + int slot;
  8605. +
  8606. + mptcp_for_each_bit_set(used_slots, slot) {
  8607. + switch (family) {
  8608. + case AF_INET:
  8609. + if (mptcp_local->locaddr4[slot].addr.s_addr == addr->in.s_addr)
  8610. + return slot;
  8611. + break;
  8612. + case AF_INET6:
  8613. + if (ipv6_addr_equal(&mptcp_local->locaddr6[slot].addr,
  8614. + &addr->in6))
  8615. + return slot;
  8616. + break;
  8617. + default:
  8618. + break;
  8619. + }
  8620. + }
  8621. +
  8622. + return -1;
  8623. +}
  8624. +
  8625. +static void mptcp_address_worker(struct work_struct *work)
  8626. +{ /* Delayed work: drains fm_ns->events, updating the namespace's local-address list and applying each event to its MPTCP sockets. */
  8627. + struct delayed_work *delayed_work = container_of(work,
  8628. + struct delayed_work,
  8629. + work);
  8630. + struct mptcp_fm_ns *fm_ns = container_of(delayed_work,
  8631. + struct mptcp_fm_ns,
  8632. + address_worker);
  8633. + struct net *net = fm_ns->net;
  8634. + struct mptcp_addr_event *event = NULL;
  8635. + struct mptcp_loc_addr *mptcp_local, *old = NULL;
  8636. + int i, id = -1; /* id is used in the socket-code on a delete-event */
  8637. + bool success; /* Used to indicate if we succeeded handling the event */
  8638. +
  8639. +next_event:
  8640. + success = false;
  8641. + kfree(event); /* frees the event handled in the previous round (NULL on the first) */
  8642. +
  8643. + /* First, let's dequeue an event from our event-list */
  8644. + rcu_read_lock_bh();
  8645. + spin_lock(&fm_ns->local_lock);
  8646. +
  8647. + event = list_first_entry_or_null(&fm_ns->events,
  8648. + struct mptcp_addr_event, list);
  8649. + if (!event) { /* queue drained: the worker is done */
  8650. + spin_unlock(&fm_ns->local_lock);
  8651. + rcu_read_unlock_bh();
  8652. + return;
  8653. + }
  8654. +
  8655. + list_del(&event->list);
  8656. +
  8657. + mptcp_local = rcu_dereference_bh(fm_ns->local);
  8658. +
  8659. + if (event->code == MPTCP_EVENT_DEL) {
  8660. + id = mptcp_find_address(mptcp_local, event->family, &event->addr);
  8661. +
  8662. + /* Not in the list - so we don't care */
  8663. + if (id < 0)
  8664. + goto duno;
  8665. +
  8666. + old = mptcp_local;
  8667. + mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
  8668. + GFP_ATOMIC);
  8669. + if (!mptcp_local)
  8670. + goto duno;
  8671. +
  8672. + if (event->family == AF_INET)
  8673. + mptcp_local->loc4_bits &= ~(1 << id);
  8674. + else
  8675. + mptcp_local->loc6_bits &= ~(1 << id);
  8676. +
  8677. + rcu_assign_pointer(fm_ns->local, mptcp_local);
  8678. + /* old is freed after a grace period, once the locks are dropped */
  8679. + } else {
  8680. + int i = mptcp_find_address(mptcp_local, event->family, &event->addr);
  8681. + int j = i; /* j < 0 remembers that the address was not known before */
  8682. +
  8683. + if (j < 0) {
  8684. + /* Not in the list, so we have to find an empty slot */
  8685. + if (event->family == AF_INET)
  8686. + i = __mptcp_find_free_index(mptcp_local->loc4_bits, -1,
  8687. + mptcp_local->next_v4_index);
  8688. + if (event->family == AF_INET6)
  8689. + i = __mptcp_find_free_index(mptcp_local->loc6_bits, -1,
  8690. + mptcp_local->next_v6_index);
  8691. +
  8692. + if (i < 0) {
  8693. + mptcp_debug("%s no more space\n", __func__);
  8694. + goto duno;
  8695. + }
  8696. +
  8697. + /* It might have been a MOD-event. */
  8698. + event->code = MPTCP_EVENT_ADD;
  8699. + } else {
  8700. + /* Let's check if anything changes */
  8701. + if (event->family == AF_INET &&
  8702. + event->low_prio == mptcp_local->locaddr4[i].low_prio)
  8703. + goto duno;
  8704. +
  8705. + if (event->family == AF_INET6 &&
  8706. + event->low_prio == mptcp_local->locaddr6[i].low_prio)
  8707. + goto duno;
  8708. + }
  8709. +
  8710. + old = mptcp_local;
  8711. + mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
  8712. + GFP_ATOMIC);
  8713. + if (!mptcp_local)
  8714. + goto duno;
  8715. +
  8716. + if (event->family == AF_INET) {
  8717. + mptcp_local->locaddr4[i].addr.s_addr = event->addr.in.s_addr;
  8718. + mptcp_local->locaddr4[i].loc4_id = i + 1;
  8719. + mptcp_local->locaddr4[i].low_prio = event->low_prio;
  8720. + } else {
  8721. + mptcp_local->locaddr6[i].addr = event->addr.in6;
  8722. + mptcp_local->locaddr6[i].loc6_id = i + MPTCP_MAX_ADDR;
  8723. + mptcp_local->locaddr6[i].low_prio = event->low_prio;
  8724. + }
  8725. +
  8726. + if (j < 0) {
  8727. + if (event->family == AF_INET) {
  8728. + mptcp_local->loc4_bits |= (1 << i);
  8729. + mptcp_local->next_v4_index = i + 1;
  8730. + } else {
  8731. + mptcp_local->loc6_bits |= (1 << i);
  8732. + mptcp_local->next_v6_index = i + 1;
  8733. + }
  8734. + }
  8735. +
  8736. + rcu_assign_pointer(fm_ns->local, mptcp_local);
  8737. + /* old is freed after a grace period, once the locks are dropped */
  8738. + }
  8739. + success = true;
  8740. +duno:
  8741. + spin_unlock(&fm_ns->local_lock);
  8742. + rcu_read_unlock_bh();
  8743. + if (!success)
  8744. + goto next_event;
  8745. + /* Lockless rcu_read_lock_bh() readers may still copy the old list */
  8746. + synchronize_rcu_bh();
  8747. + kfree(old);
  8748. + /* Now we iterate over the MPTCP-sockets and apply the event. */
  8749. + for (i = 0; i < MPTCP_HASH_SIZE; i++) {
  8750. + struct hlist_nulls_node *node;
  8751. + struct tcp_sock *meta_tp;
  8752. +
  8753. + rcu_read_lock_bh();
  8754. + hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[i],
  8755. + tk_table) {
  8756. + struct mptcp_cb *mpcb = meta_tp->mpcb;
  8757. + struct sock *meta_sk = (struct sock *)meta_tp, *sk;
  8758. + struct fullmesh_priv *fmp = (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
  8759. +
  8760. + if (sock_net(meta_sk) != net)
  8761. + continue;
  8762. +
  8763. + if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt))) /* socket is already being released */
  8764. + continue;
  8765. +
  8766. + bh_lock_sock(meta_sk);
  8767. +
  8768. + if (!meta_tp->mpc || !is_meta_sk(meta_sk) ||
  8769. + mpcb->infinite_mapping_snd ||
  8770. + mpcb->infinite_mapping_rcv ||
  8771. + mpcb->send_infinite_mapping)
  8772. + goto next;
  8773. +
  8774. + /* May be that the pm has changed in-between */
  8775. + if (mpcb->pm_ops != &full_mesh)
  8776. + goto next;
  8777. +
  8778. + if (sock_owned_by_user(meta_sk)) { /* socket busy: only flag it and move on */
  8779. + if (!test_and_set_bit(MPTCP_PATH_MANAGER,
  8780. + &meta_tp->tsq_flags))
  8781. + sock_hold(meta_sk);
  8782. +
  8783. + goto next;
  8784. + }
  8785. +
  8786. + if (event->code == MPTCP_EVENT_ADD) {
  8787. + if (event->family == AF_INET)
  8788. + fmp->add_addr++;
  8789. +#if IS_ENABLED(CONFIG_IPV6)
  8790. + if (event->family == AF_INET6)
  8791. + fmp->add_addr++;
  8792. +#endif
  8793. +
  8794. + sk = mptcp_select_ack_sock(meta_sk, 0);
  8795. + if (sk)
  8796. + tcp_send_ack(sk);
  8797. +
  8798. + full_mesh_create_subflows(meta_sk);
  8799. + }
  8800. +
  8801. + if (event->code == MPTCP_EVENT_DEL) {
  8802. + struct sock *sk, *tmpsk;
  8803. + struct mptcp_loc_addr *mptcp_local;
  8804. + bool found = false;
  8805. +
  8806. + mptcp_local = rcu_dereference_bh(fm_ns->local);
  8807. +
  8808. + /* Look for the socket and remove him */
  8809. + mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
  8810. + if ((event->family == AF_INET6 &&
  8811. + (sk->sk_family == AF_INET ||
  8812. + mptcp_v6_is_v4_mapped(sk))) ||
  8813. + (event->family == AF_INET &&
  8814. + (sk->sk_family == AF_INET6 &&
  8815. + !mptcp_v6_is_v4_mapped(sk))))
  8816. + continue;
  8817. +
  8818. + if (event->family == AF_INET &&
  8819. + (sk->sk_family == AF_INET ||
  8820. + mptcp_v6_is_v4_mapped(sk)) &&
  8821. + inet_sk(sk)->inet_saddr != event->addr.in.s_addr)
  8822. + continue;
  8823. +
  8824. + if (event->family == AF_INET6 &&
  8825. + sk->sk_family == AF_INET6 &&
  8826. + !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6))
  8827. + continue;
  8828. +
  8829. + /* Reinject, so that pf = 1 and so we
  8830. + * won't select this one as the
  8831. + * ack-sock.
  8832. + */
  8833. + mptcp_reinject_data(sk, 0);
  8834. +
  8835. + /* A master is special, it has
  8836. + * address-id 0
  8837. + */
  8838. + if (!tcp_sk(sk)->mptcp->loc_id)
  8839. + update_remove_addrs(0, meta_sk, mptcp_local);
  8840. + else if (tcp_sk(sk)->mptcp->loc_id != id)
  8841. + update_remove_addrs(tcp_sk(sk)->mptcp->loc_id, meta_sk, mptcp_local);
  8842. +
  8843. + mptcp_sub_force_close(sk);
  8844. + found = true;
  8845. + }
  8846. +
  8847. + if (!found)
  8848. + goto next;
  8849. +
  8850. + /* The id may have been given by the event,
  8851. + * matching on a local address. And it may not
  8852. + * have matched on one of the above sockets,
  8853. + * because the client never created a subflow.
  8854. + * So, we have to finally remove it here.
  8855. + */
  8856. + if (id > 0)
  8857. + update_remove_addrs(id, meta_sk, mptcp_local);
  8858. + }
  8859. +
  8860. + if (event->code == MPTCP_EVENT_MOD) {
  8861. + struct sock *sk;
  8862. +
  8863. + mptcp_for_each_sk(mpcb, sk) {
  8864. + struct tcp_sock *tp = tcp_sk(sk);
  8865. + if (event->family == AF_INET &&
  8866. + (sk->sk_family == AF_INET ||
  8867. + mptcp_v6_is_v4_mapped(sk)) &&
  8868. + inet_sk(sk)->inet_saddr == event->addr.in.s_addr) {
  8869. + if (event->low_prio != tp->mptcp->low_prio) {
  8870. + tp->mptcp->send_mp_prio = 1;
  8871. + tp->mptcp->low_prio = event->low_prio;
  8872. +
  8873. + tcp_send_ack(sk);
  8874. + }
  8875. + }
  8876. +
  8877. + if (event->family == AF_INET6 && /* only subflows bound to the modified address, as in the v4 case */
  8878. + sk->sk_family == AF_INET6 &&
  8879. + ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) {
  8880. + if (event->low_prio != tp->mptcp->low_prio) {
  8881. + tp->mptcp->send_mp_prio = 1;
  8882. + tp->mptcp->low_prio = event->low_prio;
  8883. +
  8884. + tcp_send_ack(sk);
  8885. + }
  8886. + }
  8887. + }
  8888. + }
  8889. +next:
  8890. + bh_unlock_sock(meta_sk);
  8891. + sock_put(meta_sk); /* pairs with the atomic_inc_not_zero() above */
  8892. + }
  8893. + rcu_read_unlock_bh();
  8894. + }
  8895. + goto next_event;
  8896. +}
  8897. +
  8898. +static struct mptcp_addr_event *lookup_similar_event(struct net *net,
  8899. + struct mptcp_addr_event *event)
  8900. +{
  8901. + struct mptcp_addr_event *eventq;
  8902. + struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
  8903. +
  8904. + list_for_each_entry(eventq, &fm_ns->events, list) {
  8905. + if (eventq->family != event->family)
  8906. + continue;
  8907. + if (event->family == AF_INET) {
  8908. + if (eventq->addr.in.s_addr == event->addr.in.s_addr)
  8909. + return eventq;
  8910. + } else {
  8911. + if (ipv6_addr_equal(&eventq->addr.in6, &event->addr.in6))
  8912. + return eventq;
  8913. + }
  8914. + }
  8915. + return NULL;
  8916. +}
  8917. +
  8918. +/* We already hold the net-namespace MPTCP-lock */
  8919. +static void add_pm_event(struct net *net, struct mptcp_addr_event *event)
  8920. +{
  8921. + struct mptcp_addr_event *eventq = lookup_similar_event(net, event);
  8922. + struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
  8923. +
  8924. + if (eventq) {
  8925. + switch (event->code) {
  8926. + case MPTCP_EVENT_DEL:
  8927. + list_del(&eventq->list);
  8928. + kfree(eventq);
  8929. + break;
  8930. + case MPTCP_EVENT_ADD:
  8931. + eventq->low_prio = event->low_prio;
  8932. + eventq->code = MPTCP_EVENT_ADD;
  8933. + return;
  8934. + case MPTCP_EVENT_MOD:
  8935. + eventq->low_prio = event->low_prio;
  8936. + return;
  8937. + }
  8938. + }
  8939. +
  8940. + /* OK, we have to add the new address to the wait queue */
  8941. + eventq = kmemdup(event, sizeof(struct mptcp_addr_event), GFP_ATOMIC);
  8942. + if (!eventq)
  8943. + return;
  8944. +
  8945. + list_add_tail(&eventq->list, &fm_ns->events);
  8946. +
  8947. + /* Create work-queue */
  8948. + if (!delayed_work_pending(&fm_ns->address_worker))
  8949. + queue_delayed_work(mptcp_wq, &fm_ns->address_worker,
  8950. + msecs_to_jiffies(500));
  8951. +}
  8952. +
  8953. +static void addr4_event_handler(struct in_ifaddr *ifa, unsigned long event,
  8954. + struct net *net)
  8955. +{
  8956. + struct net_device *netdev = ifa->ifa_dev->dev;
  8957. + struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
  8958. + struct mptcp_addr_event mpevent;
  8959. +
  8960. + if (ifa->ifa_scope > RT_SCOPE_LINK ||
  8961. + ipv4_is_loopback(ifa->ifa_local))
  8962. + return;
  8963. +
  8964. + spin_lock_bh(&fm_ns->local_lock);
  8965. +
  8966. + mpevent.family = AF_INET;
  8967. + mpevent.addr.in.s_addr = ifa->ifa_local;
  8968. + mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
  8969. +
  8970. + if (event == NETDEV_DOWN || !netif_running(netdev) ||
  8971. + (netdev->flags & IFF_NOMULTIPATH))
  8972. + mpevent.code = MPTCP_EVENT_DEL;
  8973. + else if (event == NETDEV_UP)
  8974. + mpevent.code = MPTCP_EVENT_ADD;
  8975. + else if (event == NETDEV_CHANGE)
  8976. + mpevent.code = MPTCP_EVENT_MOD;
  8977. +
  8978. + add_pm_event(net, &mpevent);
  8979. +
  8980. + spin_unlock_bh(&fm_ns->local_lock);
  8981. + return;
  8982. +}
  8983. +
  8984. +/* React on IPv4-addr add/rem-events */
  8985. +static int mptcp_pm_inetaddr_event(struct notifier_block *this,
  8986. + unsigned long event, void *ptr)
  8987. +{
  8988. + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
  8989. + struct net *net = dev_net(ifa->ifa_dev->dev);
  8990. +
  8991. + addr4_event_handler(ifa, event, net);
  8992. +
  8993. + return NOTIFY_DONE;
  8994. +}
  8995. +
/* Notifier block whose callback is mptcp_pm_inetaddr_event(); registered on
 * the inetaddr notifier chain in full_mesh_register().
 */
static struct notifier_block mptcp_pm_inetaddr_notifier = {
	.notifier_call = mptcp_pm_inetaddr_event,
};
  8999. +
  9000. +#if IS_ENABLED(CONFIG_IPV6)
  9001. +
/* IPV6-related address/interface watchers */

/* State for polling an IPv6 address that was still in duplicate address
 * detection (DAD) when its event arrived. Allocated in dad_setup_timer()
 * and freed in dad_callback() once the address has left the DAD state.
 */
struct mptcp_dad_data {
	struct timer_list timer;	/* runs dad_callback() */
	struct inet6_ifaddr *ifa;	/* address being polled */
};
  9007. +
/* Forward declarations: dad_init_timer() installs dad_callback() as the
 * timer function, and dad_callback() calls inet6_addr_event(); both are
 * defined further down.
 */
static void dad_callback(unsigned long arg);
static int inet6_addr_event(struct notifier_block *this,
			    unsigned long event, void *ptr);
  9011. +
  9012. +static int ipv6_is_in_dad_state(struct inet6_ifaddr *ifa)
  9013. +{
  9014. + return ((ifa->flags & IFA_F_TENTATIVE) &&
  9015. + ifa->state == INET6_IFADDR_STATE_DAD);
  9016. +}
  9017. +
  9018. +static void dad_init_timer(struct mptcp_dad_data *data,
  9019. + struct inet6_ifaddr *ifa)
  9020. +{
  9021. + data->ifa = ifa;
  9022. + data->timer.data = (unsigned long)data;
  9023. + data->timer.function = dad_callback;
  9024. + if (ifa->idev->cnf.rtr_solicit_delay)
  9025. + data->timer.expires = jiffies + ifa->idev->cnf.rtr_solicit_delay;
  9026. + else
  9027. + data->timer.expires = jiffies + (HZ/10);
  9028. +}
  9029. +
  9030. +static void dad_callback(unsigned long arg)
  9031. +{
  9032. + struct mptcp_dad_data *data = (struct mptcp_dad_data *)arg;
  9033. +
  9034. + if (ipv6_is_in_dad_state(data->ifa)) {
  9035. + dad_init_timer(data, data->ifa);
  9036. + add_timer(&data->timer);
  9037. + } else {
  9038. + inet6_addr_event(NULL, NETDEV_UP, data->ifa);
  9039. + in6_ifa_put(data->ifa);
  9040. + kfree(data);
  9041. + }
  9042. +}
  9043. +
  9044. +static inline void dad_setup_timer(struct inet6_ifaddr *ifa)
  9045. +{
  9046. + struct mptcp_dad_data *data;
  9047. +
  9048. + data = kmalloc(sizeof(*data), GFP_ATOMIC);
  9049. +
  9050. + if (!data)
  9051. + return;
  9052. +
  9053. + init_timer(&data->timer);
  9054. + dad_init_timer(data, ifa);
  9055. + add_timer(&data->timer);
  9056. + in6_ifa_hold(ifa);
  9057. +}
  9058. +
  9059. +static void addr6_event_handler(struct inet6_ifaddr *ifa, unsigned long event,
  9060. + struct net *net)
  9061. +{
  9062. + struct net_device *netdev = ifa->idev->dev;
  9063. + int addr_type = ipv6_addr_type(&ifa->addr);
  9064. + struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
  9065. + struct mptcp_addr_event mpevent;
  9066. +
  9067. + if (ifa->scope > RT_SCOPE_LINK ||
  9068. + addr_type == IPV6_ADDR_ANY ||
  9069. + (addr_type & IPV6_ADDR_LOOPBACK) ||
  9070. + (addr_type & IPV6_ADDR_LINKLOCAL))
  9071. + return;
  9072. +
  9073. + spin_lock_bh(&fm_ns->local_lock);
  9074. +
  9075. + mpevent.family = AF_INET6;
  9076. + mpevent.addr.in6 = ifa->addr;
  9077. + mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
  9078. +
  9079. + if (event == NETDEV_DOWN ||!netif_running(netdev) ||
  9080. + (netdev->flags & IFF_NOMULTIPATH))
  9081. + mpevent.code = MPTCP_EVENT_DEL;
  9082. + else if (event == NETDEV_UP)
  9083. + mpevent.code = MPTCP_EVENT_ADD;
  9084. + else if (event == NETDEV_CHANGE)
  9085. + mpevent.code = MPTCP_EVENT_MOD;
  9086. +
  9087. + add_pm_event(net, &mpevent);
  9088. +
  9089. + spin_unlock_bh(&fm_ns->local_lock);
  9090. + return;
  9091. +}
  9092. +
  9093. +/* React on IPv6-addr add/rem-events */
  9094. +static int inet6_addr_event(struct notifier_block *this, unsigned long event,
  9095. + void *ptr)
  9096. +{
  9097. + struct inet6_ifaddr *ifa6 = (struct inet6_ifaddr *)ptr;
  9098. + struct net *net = dev_net(ifa6->idev->dev);
  9099. +
  9100. + if (ipv6_is_in_dad_state(ifa6))
  9101. + dad_setup_timer(ifa6);
  9102. + else
  9103. + addr6_event_handler(ifa6, event, net);
  9104. +
  9105. + return NOTIFY_DONE;
  9106. +}
  9107. +
/* Notifier block whose callback is inet6_addr_event(); registered on the
 * inet6addr notifier chain in full_mesh_register().
 */
static struct notifier_block inet6_addr_notifier = {
	.notifier_call = inet6_addr_event,
};
  9111. +
  9112. +#endif
  9113. +
  9114. +/* React on ifup/down-events */
  9115. +static int netdev_event(struct notifier_block *this, unsigned long event,
  9116. + void *ptr)
  9117. +{
  9118. + struct net_device *dev = netdev_notifier_info_to_dev(ptr);
  9119. + struct in_device *in_dev;
  9120. +#if IS_ENABLED(CONFIG_IPV6)
  9121. + struct inet6_dev *in6_dev;
  9122. +#endif
  9123. +
  9124. + if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
  9125. + event == NETDEV_CHANGE))
  9126. + return NOTIFY_DONE;
  9127. +
  9128. + rcu_read_lock();
  9129. + in_dev = __in_dev_get_rtnl(dev);
  9130. +
  9131. + if (in_dev) {
  9132. + for_ifa(in_dev) {
  9133. + mptcp_pm_inetaddr_event(NULL, event, ifa);
  9134. + } endfor_ifa(in_dev);
  9135. + }
  9136. +
  9137. +#if IS_ENABLED(CONFIG_IPV6)
  9138. + in6_dev = __in6_dev_get(dev);
  9139. +
  9140. + if (in6_dev) {
  9141. + struct inet6_ifaddr *ifa6;
  9142. + list_for_each_entry(ifa6, &in6_dev->addr_list, if_list)
  9143. + inet6_addr_event(NULL, event, ifa6);
  9144. + }
  9145. +#endif
  9146. +
  9147. + rcu_read_unlock();
  9148. + return NOTIFY_DONE;
  9149. +}
  9150. +
/* Notifier block whose callback is netdev_event(); registered on the
 * netdevice notifier chain in full_mesh_register().
 */
static struct notifier_block mptcp_pm_netdev_notifier = {
	.notifier_call = netdev_event,
};
  9154. +
  9155. +static void full_mesh_new_session(struct sock *meta_sk, int index)
  9156. +{
  9157. + struct mptcp_loc_addr *mptcp_local;
  9158. + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
  9159. + struct fullmesh_priv *fmp = (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
  9160. + struct net *net = sock_net(meta_sk);
  9161. + struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
  9162. + struct sock *sk;
  9163. + int i;
  9164. +
  9165. + if (index == -1) {
  9166. + mptcp_fallback_default(mpcb);
  9167. + return;
  9168. + }
  9169. +
  9170. + /* Initialize workqueue-struct */
  9171. + INIT_WORK(&fmp->subflow_work, create_subflow_worker);
  9172. + INIT_DELAYED_WORK(&fmp->subflow_retry_work, retry_subflow_worker);
  9173. + fmp->mpcb = mpcb;
  9174. +
  9175. + sk = mptcp_select_ack_sock(meta_sk, 0);
  9176. +
  9177. + rcu_read_lock();
  9178. + mptcp_local = rcu_dereference(fm_ns->local);
  9179. +
  9180. + /* Look for the address among the local addresses */
  9181. + mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
  9182. + __be32 ifa_address = mptcp_local->locaddr4[i].addr.s_addr;
  9183. +
  9184. + /* We do not need to announce the initial subflow's address again */
  9185. + if ((meta_sk->sk_family == AF_INET ||
  9186. + mptcp_v6_is_v4_mapped(meta_sk)) &&
  9187. + inet_sk(meta_sk)->inet_saddr == ifa_address)
  9188. + continue;
  9189. +
  9190. + fmp->add_addr++;
  9191. +
  9192. + if (sk)
  9193. + tcp_send_ack(sk);
  9194. + }
  9195. +
  9196. +#if IS_ENABLED(CONFIG_IPV6)
  9197. + mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
  9198. + struct in6_addr *ifa6 = &mptcp_local->locaddr6[i].addr;
  9199. +
  9200. + /* We do not need to announce the initial subflow's address again */
  9201. + if (meta_sk->sk_family == AF_INET6 &&
  9202. + ipv6_addr_equal(&inet6_sk(meta_sk)->saddr, ifa6))
  9203. + continue;
  9204. +
  9205. + fmp->add_addr++;
  9206. +
  9207. + if (sk)
  9208. + tcp_send_ack(sk);
  9209. + }
  9210. +#endif
  9211. +
  9212. + rcu_read_unlock();
  9213. +
  9214. + if (meta_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(meta_sk))
  9215. + fmp->announced_addrs_v4 |= (1 << index);
  9216. + else
  9217. + fmp->announced_addrs_v6 |= (1 << index);
  9218. +}
  9219. +
  9220. +static void full_mesh_create_subflows(struct sock *meta_sk)
  9221. +{
  9222. + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
  9223. + struct fullmesh_priv *pm_priv = (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
  9224. +
  9225. + if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
  9226. + mpcb->send_infinite_mapping ||
  9227. + mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
  9228. + return;
  9229. +
  9230. + /* The master may not yet be fully established (address added through
  9231. + * mptcp_update_metasocket). Then, we should not attempt to create new
  9232. + * subflows.
  9233. + */
  9234. + if (mpcb->master_sk &&
  9235. + !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
  9236. + return;
  9237. +
  9238. + if (!work_pending(&pm_priv->subflow_work)) {
  9239. + sock_hold(meta_sk);
  9240. + queue_work(mptcp_wq, &pm_priv->subflow_work);
  9241. + }
  9242. +}
  9243. +
/* Called upon release_sock, if the socket was owned by the user during
 * a path-management event.
 *
 * Re-synchronizes the connection with the namespace's current local
 * address list, under rcu_read_lock():
 *  1. for every local IPv4 (and, with CONFIG_IPV6, IPv6) address, update the
 *     priority of subflows bound to it, or - if no subflow uses it - count a
 *     pending announcement, send an ACK and try to create subflows;
 *  2. for every subflow whose source address is no longer in the list,
 *     reinject its data, record its address id for removal and force-close
 *     it.
 */
static void full_mesh_release_sock(struct sock *meta_sk)
{
	struct mptcp_loc_addr *mptcp_local;
	struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
	struct fullmesh_priv *fmp = (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
	struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
	struct sock *sk, *tmpsk;
	int i;

	rcu_read_lock();
	mptcp_local = rcu_dereference(fm_ns->local);

	/* First, detect modifications or additions */
	mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
		struct in_addr ifa = mptcp_local->locaddr4[i].addr;
		bool found = false;

		mptcp_for_each_sk(mpcb, sk) {
			struct tcp_sock *tp = tcp_sk(sk);

			/* Only IPv4 and v4-mapped subflows can match */
			if (sk->sk_family == AF_INET6 &&
			    !mptcp_v6_is_v4_mapped(sk))
				continue;

			if (inet_sk(sk)->inet_saddr != ifa.s_addr)
				continue;

			found = true;

			/* Priority of the address changed: flag the subflow so
			 * that the change is signalled, and trigger an ACK.
			 */
			if (mptcp_local->locaddr4[i].low_prio != tp->mptcp->low_prio) {
				tp->mptcp->send_mp_prio = 1;
				tp->mptcp->low_prio = mptcp_local->locaddr4[i].low_prio;

				tcp_send_ack(sk);
			}
		}

		/* No subflow uses this address yet: treat it as new */
		if (!found) {
			fmp->add_addr++;

			sk = mptcp_select_ack_sock(meta_sk, 0);
			if (sk)
				tcp_send_ack(sk);
			full_mesh_create_subflows(meta_sk);
		}
	}

#if IS_ENABLED(CONFIG_IPV6)
	/* Same as above, for the IPv6 local addresses */
	mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
		struct in6_addr ifa = mptcp_local->locaddr6[i].addr;
		bool found = false;

		mptcp_for_each_sk(mpcb, sk) {
			struct tcp_sock *tp = tcp_sk(sk);

			/* Only genuine IPv6 subflows can match */
			if (sk->sk_family == AF_INET ||
			    mptcp_v6_is_v4_mapped(sk))
				continue;

			if (!ipv6_addr_equal(&inet6_sk(sk)->saddr, &ifa))
				continue;

			found = true;

			if (mptcp_local->locaddr6[i].low_prio != tp->mptcp->low_prio) {
				tp->mptcp->send_mp_prio = 1;
				tp->mptcp->low_prio = mptcp_local->locaddr6[i].low_prio;

				tcp_send_ack(sk);
			}
		}

		if (!found) {
			fmp->add_addr++;

			sk = mptcp_select_ack_sock(meta_sk, 0);
			if (sk)
				tcp_send_ack(sk);
			full_mesh_create_subflows(meta_sk);
		}
	}
#endif

	/* Now, detect address-removals */
	mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
		/* Assume the address is gone until it is found in the list */
		bool shall_remove = true;

		if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) {
			mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
				if (inet_sk(sk)->inet_saddr == mptcp_local->locaddr4[i].addr.s_addr) {
					shall_remove = false;
					break;
				}
			}
		} else {
			mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
				if (ipv6_addr_equal(&inet6_sk(sk)->saddr, &mptcp_local->locaddr6[i].addr)) {
					shall_remove = false;
					break;
				}
			}
		}

		if (shall_remove) {
			/* Reinject, so that pf = 1 and so we
			 * won't select this one as the
			 * ack-sock.
			 */
			mptcp_reinject_data(sk, 0);

			update_remove_addrs(tcp_sk(sk)->mptcp->loc_id, meta_sk,
					    mptcp_local);

			/* For the master subflow, address id 0 is removed
			 * as well.
			 */
			if (mpcb->master_sk == sk)
				update_remove_addrs(0, meta_sk, mptcp_local);

			mptcp_sub_force_close(sk);
		}
	}
	rcu_read_unlock();
}
  9368. +
  9369. +static int full_mesh_get_local_index(sa_family_t family, union inet_addr *addr,
  9370. + struct net *net)
  9371. +{
  9372. + struct mptcp_loc_addr *mptcp_local;
  9373. + struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
  9374. + int index;
  9375. +
  9376. + /* Handle the backup-flows */
  9377. + rcu_read_lock();
  9378. + mptcp_local = rcu_dereference(fm_ns->local);
  9379. +
  9380. + index = mptcp_find_address(mptcp_local, family, addr);
  9381. +
  9382. + rcu_read_unlock();
  9383. +
  9384. + return index;
  9385. +}
  9386. +
  9387. +static int full_mesh_get_local_id(sa_family_t family, union inet_addr *addr,
  9388. + struct net *net)
  9389. +{
  9390. + struct mptcp_loc_addr *mptcp_local;
  9391. + struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
  9392. + int index, id = -1;
  9393. +
  9394. + /* Handle the backup-flows */
  9395. + rcu_read_lock();
  9396. + mptcp_local = rcu_dereference(fm_ns->local);
  9397. +
  9398. + index = mptcp_find_address(mptcp_local, family, addr);
  9399. +
  9400. + if (index != -1) {
  9401. + if (family == AF_INET)
  9402. + id = mptcp_local->locaddr4[index].loc4_id;
  9403. + else
  9404. + id = mptcp_local->locaddr6[index].loc6_id;
  9405. + }
  9406. +
  9407. +
  9408. + rcu_read_unlock();
  9409. +
  9410. + return id;
  9411. +}
  9412. +
/* Fill in the ADD_ADDR / REMOVE_ADDR parts of @opts for a segment sent on
 * subflow @sk.
 *
 * @size is the option space used so far; it is increased by the aligned
 * length of every option added here, and an option is only added if it
 * still fits into MAX_TCP_OPTION_SPACE.
 * When @skb is NULL only @opts and @size are updated; the path manager's
 * bookkeeping (announced bitmaps, add_addr counter, remove_addrs) is
 * modified only when @skb is non-NULL.
 */
static void full_mesh_addr_signal(struct sock *sk, unsigned *size,
				  struct tcp_out_options *opts,
				  struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct mptcp_cb *mpcb = tp->mpcb;
	struct fullmesh_priv *fmp = (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
	struct mptcp_loc_addr *mptcp_local;
	struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk));
	int remove_addr_len;
	u8 unannouncedv4, unannouncedv6;

	/* Common case: no announcement pending */
	if (likely(!fmp->add_addr))
		goto remove_addr;

	rcu_read_lock();
	mptcp_local = rcu_dereference(fm_ns->local);

	/* IPv4 */
	/* Local addresses that are set in loc4_bits but not yet announced */
	unannouncedv4 = (~fmp->announced_addrs_v4) & mptcp_local->loc4_bits;
	if (unannouncedv4 &&
	    MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) {
		int ind = mptcp_find_free_index(~unannouncedv4);

		opts->options |= OPTION_MPTCP;
		opts->mptcp_options |= OPTION_ADD_ADDR;
		opts->add_addr4.addr_id = mptcp_local->locaddr4[ind].loc4_id;
		opts->add_addr4.addr = mptcp_local->locaddr4[ind].addr;
		opts->add_addr_v4 = 1;

		if (skb) {
			fmp->announced_addrs_v4 |= (1 << ind);
			fmp->add_addr--;
		}
		*size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN;
	}

	/* IPv6 */
	/* Local addresses that are set in loc6_bits but not yet announced */
	unannouncedv6 = (~fmp->announced_addrs_v6) & mptcp_local->loc6_bits;
	if (unannouncedv6 &&
	    MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) {
		int ind = mptcp_find_free_index(~unannouncedv6);

		opts->options |= OPTION_MPTCP;
		opts->mptcp_options |= OPTION_ADD_ADDR;
		opts->add_addr6.addr_id = mptcp_local->locaddr6[ind].loc6_id;
		opts->add_addr6.addr = mptcp_local->locaddr6[ind].addr;
		opts->add_addr_v6 = 1;

		if (skb) {
			fmp->announced_addrs_v6 |= (1 << ind);
			fmp->add_addr--;
		}
		*size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN;
	}

	rcu_read_unlock();

	/* add_addr was non-zero although nothing is left to announce: count
	 * it down so that it can reach zero again.
	 */
	if (!unannouncedv4 && !unannouncedv6 && skb) {
		fmp->add_addr--;
	}

remove_addr:
	/* Common case: no address removal pending */
	if (likely(!fmp->remove_addrs))
		return;

	remove_addr_len = mptcp_sub_len_remove_addr_align(fmp->remove_addrs);
	if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len)
		return;

	opts->options |= OPTION_MPTCP;
	opts->mptcp_options |= OPTION_REMOVE_ADDR;
	opts->remove_addrs = fmp->remove_addrs;
	*size += remove_addr_len;
	if (skb)
		fmp->remove_addrs = 0;
}
  9490. +
  9491. +static int mptcp_fm_init_net(struct net *net)
  9492. +{
  9493. + struct mptcp_loc_addr *mptcp_local;
  9494. + struct mptcp_fm_ns *fm_ns;
  9495. +
  9496. + fm_ns = kzalloc(sizeof(*fm_ns), GFP_KERNEL);
  9497. + if (!fm_ns)
  9498. + return -ENOBUFS;
  9499. +
  9500. + mptcp_local = kzalloc(sizeof(*mptcp_local), GFP_KERNEL);
  9501. + if (!mptcp_local) {
  9502. + kfree(fm_ns);
  9503. + return -ENOBUFS;
  9504. + }
  9505. +
  9506. + mptcp_local->next_v4_index = 1;
  9507. +
  9508. + rcu_assign_pointer(fm_ns->local, mptcp_local);
  9509. + INIT_DELAYED_WORK(&fm_ns->address_worker, mptcp_address_worker);
  9510. + INIT_LIST_HEAD(&fm_ns->events);
  9511. + spin_lock_init(&fm_ns->local_lock);
  9512. + fm_ns->net = net;
  9513. + net->mptcp.path_managers[MPTCP_PM_FULLMESH] = fm_ns;
  9514. +
  9515. + return 0;
  9516. +}
  9517. +
/* Per-namespace teardown: stop the address worker, then free the local
 * address list, every still-queued address event and finally the
 * namespace state itself.
 */
static void mptcp_fm_exit_net(struct net *net)
{
	struct mptcp_addr_event *eventq, *tmp;
	struct mptcp_fm_ns *fm_ns;
	struct mptcp_loc_addr *mptcp_local;

	fm_ns = fm_get_ns(net);
	/* Wait for a running worker and prevent a pending one from starting */
	cancel_delayed_work_sync(&fm_ns->address_worker);

	rcu_read_lock_bh();

	mptcp_local = rcu_dereference_bh(fm_ns->local);
	kfree(mptcp_local);

	/* Drop the events that were queued but never processed */
	spin_lock(&fm_ns->local_lock);
	list_for_each_entry_safe(eventq, tmp, &fm_ns->events, list) {
		list_del(&eventq->list);
		kfree(eventq);
	}
	spin_unlock(&fm_ns->local_lock);

	rcu_read_unlock_bh();

	kfree(fm_ns);
}
  9543. +
/* Per-network-namespace setup/teardown hooks of the full-mesh path
 * manager; registered in full_mesh_register().
 */
static struct pernet_operations full_mesh_net_ops = {
	.init = mptcp_fm_init_net,
	.exit = mptcp_fm_exit_net,
};
  9548. +
/* Operations of the "fullmesh" path manager. Note that the same handler,
 * full_mesh_create_subflows(), serves both the fully_established and the
 * new_remote_address hooks.
 */
static struct mptcp_pm_ops full_mesh __read_mostly = {
	.new_session = full_mesh_new_session,
	.release_sock = full_mesh_release_sock,
	.fully_established = full_mesh_create_subflows,
	.new_remote_address = full_mesh_create_subflows,
	.get_local_index = full_mesh_get_local_index,
	.get_local_id = full_mesh_get_local_id,
	.addr_signal = full_mesh_addr_signal,
	.name = "fullmesh",
	.owner = THIS_MODULE,
};
  9560. +
  9561. +/* General initialization of MPTCP_PM */
  9562. +static int __init full_mesh_register(void)
  9563. +{
  9564. + int ret;
  9565. +
  9566. + BUILD_BUG_ON(sizeof(struct fullmesh_priv) > MPTCP_PM_SIZE);
  9567. +
  9568. + ret = register_pernet_subsys(&full_mesh_net_ops);
  9569. + if (ret)
  9570. + goto out;
  9571. +
  9572. + ret = register_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
  9573. + if (ret)
  9574. + goto err_reg_inetaddr;
  9575. + ret = register_netdevice_notifier(&mptcp_pm_netdev_notifier);
  9576. + if (ret)
  9577. + goto err_reg_netdev;
  9578. +
  9579. +#if IS_ENABLED(CONFIG_IPV6)
  9580. + ret = register_inet6addr_notifier(&inet6_addr_notifier);
  9581. + if (ret)
  9582. + goto err_reg_inet6addr;
  9583. +#endif
  9584. +
  9585. + ret = mptcp_register_path_manager(&full_mesh);
  9586. + if (ret)
  9587. + goto err_reg_pm;
  9588. +
  9589. +out:
  9590. + return ret;
  9591. +
  9592. +
  9593. +err_reg_pm:
  9594. +#if IS_ENABLED(CONFIG_IPV6)
  9595. + unregister_inet6addr_notifier(&inet6_addr_notifier);
  9596. +err_reg_inet6addr:
  9597. +#endif
  9598. + unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
  9599. +err_reg_netdev:
  9600. + unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
  9601. +err_reg_inetaddr:
  9602. + unregister_pernet_subsys(&full_mesh_net_ops);
  9603. + goto out;
  9604. +}
  9605. +
/* Module exit: unregister the notifiers, the per-namespace operations and
 * the path manager.
 * NOTE(review): the path manager is unregistered only after the
 * per-namespace state has been torn down - confirm that no connection can
 * still reach the full-mesh hooks in between.
 */
static void full_mesh_unregister(void)
{
#if IS_ENABLED(CONFIG_IPV6)
	unregister_inet6addr_notifier(&inet6_addr_notifier);
#endif
	unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
	unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
	unregister_pernet_subsys(&full_mesh_net_ops);
	mptcp_unregister_path_manager(&full_mesh);
}
  9616. +
/* Module entry/exit points and metadata */
module_init(full_mesh_register);
module_exit(full_mesh_unregister);

MODULE_AUTHOR("Christoph Paasch");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Full-Mesh MPTCP");
MODULE_VERSION("0.88");
  9624. diff --git a/net/mptcp/mptcp_input.c b/net/mptcp/mptcp_input.c
  9625. new file mode 100644
  9626. index 0000000..f3c9057
  9627. --- /dev/null
  9628. +++ b/net/mptcp/mptcp_input.c
  9629. @@ -0,0 +1,2254 @@
  9630. +/*
  9631. + * MPTCP implementation - Receiving side
  9632. + *
  9633. + * Initial Design & Implementation:
  9634. + * Sébastien Barré <sebastien.barre@uclouvain.be>
  9635. + *
  9636. + * Current Maintainer & Author:
  9637. + * Christoph Paasch <christoph.paasch@uclouvain.be>
  9638. + *
  9639. + * Additional authors:
  9640. + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
  9641. + * Gregory Detal <gregory.detal@uclouvain.be>
  9642. + * Fabien Duchêne <fabien.duchene@uclouvain.be>
  9643. + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
  9644. + * Lavkesh Lahngir <lavkesh51@gmail.com>
  9645. + * Andreas Ripke <ripke@neclab.eu>
  9646. + * Vlad Dogaru <vlad.dogaru@intel.com>
  9647. + * Octavian Purdila <octavian.purdila@intel.com>
  9648. + * John Ronan <jronan@tssg.org>
  9649. + * Catalin Nicutar <catalin.nicutar@gmail.com>
  9650. + * Brandon Heller <brandonh@stanford.edu>
  9651. + *
  9652. + *
  9653. + * This program is free software; you can redistribute it and/or
  9654. + * modify it under the terms of the GNU General Public License
  9655. + * as published by the Free Software Foundation; either version
  9656. + * 2 of the License, or (at your option) any later version.
  9657. + */
  9658. +
  9659. +#include <asm/unaligned.h>
  9660. +
  9661. +#include <net/mptcp.h>
  9662. +#include <net/mptcp_v4.h>
  9663. +#include <net/mptcp_v6.h>
  9664. +
  9665. +#include <linux/kconfig.h>
  9666. +
  9667. +/* is seq1 < seq2 ? */
  9668. +static inline int before64(const u64 seq1, const u64 seq2)
  9669. +{
  9670. + return (s64)(seq1 - seq2) < 0;
  9671. +}
  9672. +
/* is seq1 > seq2 ? Implemented as before64() with the arguments swapped. */
#define after64(seq1, seq2)	before64(seq2, seq1)
  9675. +
  9676. +static inline void mptcp_become_fully_estab(struct sock *sk)
  9677. +{
  9678. + tcp_sk(sk)->mptcp->fully_established = 1;
  9679. +
  9680. + if (is_master_tp(tcp_sk(sk)) &&
  9681. + tcp_sk(sk)->mpcb->pm_ops->fully_established)
  9682. + tcp_sk(sk)->mpcb->pm_ops->fully_established(mptcp_meta_sk(sk));
  9683. +}
  9684. +
/* Similar to tcp_tso_acked without any memory accounting.
 *
 * Removes from the head of @skb the bytes that lie below tp->snd_una,
 * advances the skb's start sequence number accordingly and returns by how
 * many packets (tcp_skb_pcount) the skb shrank. Returns 0 if the skb could
 * not be uncloned. The skb must end after snd_una (BUG otherwise).
 */
static inline int mptcp_tso_acked_reinject(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 packets_acked, len;

	BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));

	/* Packet count before trimming; the new count is subtracted below */
	packets_acked = tcp_skb_pcount(skb);

	if (skb_unclone(skb, GFP_ATOMIC))
		return 0;

	/* Number of bytes at the head that are already acknowledged */
	len = tp->snd_una - TCP_SKB_CB(skb)->seq;
	__pskb_trim_head(skb, len);

	TCP_SKB_CB(skb)->seq += len;
	skb->ip_summed = CHECKSUM_PARTIAL;
	skb->truesize -= len;

	/* Any change of skb->len requires recalculation of tso factor. */
	if (tcp_skb_pcount(skb) > 1)
		tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
	packets_acked -= tcp_skb_pcount(skb);

	if (packets_acked) {
		BUG_ON(tcp_skb_pcount(skb) == 0);
		BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
	}

	return packets_acked;
}
  9717. +
/**
 * Cleans the meta-socket retransmission queue and the reinject-queue.
 * @meta_sk must be the metasocket.
 * @prior_snd_una is the meta-level snd_una before the current ACK was
 * processed; it is used to decide whether snd_up has to follow snd_una.
 */
static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una)
{
	struct sk_buff *skb, *tmp;
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
	struct mptcp_cb *mpcb = meta_tp->mpcb;
	bool acked = false;
	u32 acked_pcount;

	/* Walk the already-sent part of the write queue */
	while ((skb = tcp_write_queue_head(meta_sk)) &&
	       skb != tcp_send_head(meta_sk)) {
		bool fully_acked = true;

		if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
			/* Not completely acknowledged. Stop unless this is a
			 * multi-packet skb that snd_una has moved into.
			 */
			if (tcp_skb_pcount(skb) == 1 ||
			    !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
				break;

			acked_pcount = tcp_tso_acked(meta_sk, skb);
			if (!acked_pcount)
				break;

			fully_acked = false;
		} else {
			acked_pcount = tcp_skb_pcount(skb);
		}

		acked = true;
		meta_tp->packets_out -= acked_pcount;
		meta_tp->retrans_stamp = 0;

		/* A partially acked skb stays in the queue */
		if (!fully_acked)
			break;

		tcp_unlink_write_queue(skb, meta_sk);

		if (mptcp_is_data_fin(skb)) {
			struct sock *sk_it;

			/* DATA_FIN has been acknowledged - now we can close
			 * the subflows
			 */
			mptcp_for_each_sk(mpcb, sk_it) {
				unsigned long delay = 0;

				/* If we are the passive closer, don't trigger
				 * subflow-fin until the subflow has been finned
				 * by the peer - thus we add a delay.
				 */
				if (mpcb->passive_close &&
				    sk_it->sk_state == TCP_ESTABLISHED)
					delay = inet_csk(sk_it)->icsk_rto << 3;

				mptcp_sub_close(sk_it, delay);
			}
		}
		sk_wmem_free_skb(meta_sk, skb);
	}
	/* Remove acknowledged data from the reinject queue */
	skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) {
		if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
			/* First skb that is not completely acknowledged: trim
			 * its acknowledged head if possible, then stop.
			 */
			if (tcp_skb_pcount(skb) == 1 ||
			    !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
				break;

			mptcp_tso_acked_reinject(meta_sk, skb);
			break;
		}

		__skb_unlink(skb, &mpcb->reinject_queue);
		__kfree_skb(skb);
	}

	if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una)))
		meta_tp->snd_up = meta_tp->snd_una;

	if (acked) {
		tcp_rearm_rto(meta_sk);
		/* Normally this is done in tcp_try_undo_loss - but MPTCP
		 * does not call this function.
		 */
		inet_csk(meta_sk)->icsk_retransmits = 0;
	}
}
  9805. +
  9806. +/* Inspired by tcp_rcv_state_process */
  9807. +static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk,
  9808. + const struct sk_buff *skb, u32 data_seq,
  9809. + u16 data_len)
  9810. +{
  9811. + struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
  9812. + struct tcphdr *th = tcp_hdr(skb);
  9813. +
  9814. + /* State-machine handling if FIN has been enqueued and he has
  9815. + * been acked (snd_una == write_seq) - it's important that this
  9816. + * here is after sk_wmem_free_skb because otherwise
  9817. + * sk_forward_alloc is wrong upon inet_csk_destroy_sock()
  9818. + */
  9819. + switch (meta_sk->sk_state) {
  9820. + case TCP_FIN_WAIT1:
  9821. + if (meta_tp->snd_una == meta_tp->write_seq) {
  9822. + struct dst_entry *dst = __sk_dst_get(meta_sk);
  9823. +
  9824. + tcp_set_state(meta_sk, TCP_FIN_WAIT2);
  9825. + meta_sk->sk_shutdown |= SEND_SHUTDOWN;
  9826. +
  9827. + dst = __sk_dst_get(sk);
  9828. + if (dst)
  9829. + dst_confirm(dst);
  9830. +
  9831. + if (!sock_flag(meta_sk, SOCK_DEAD)) {
  9832. + /* Wake up lingering close() */
  9833. + meta_sk->sk_state_change(meta_sk);
  9834. + } else {
  9835. + int tmo;
  9836. +
  9837. + if (meta_tp->linger2 < 0 ||
  9838. + (data_len &&
  9839. + after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 1 : 0),
  9840. + meta_tp->rcv_nxt))) {
  9841. + mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
  9842. + tcp_done(meta_sk);
  9843. + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
  9844. + return 1;
  9845. + }
  9846. +
  9847. + tmo = tcp_fin_time(meta_sk);
  9848. + if (tmo > TCP_TIMEWAIT_LEN) {
  9849. + inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN);
  9850. + } else if (mptcp_is_data_fin2(skb, tp) ||
  9851. + sock_owned_by_user(meta_sk)) {
  9852. + /* Bad case. We could lose such FIN otherwise.
  9853. + * It is not a big problem, but it looks confusing
  9854. + * and not so rare event. We still can lose it now,
  9855. + * if it spins in bh_lock_sock(), but it is really
  9856. + * marginal case.
  9857. + */
  9858. + inet_csk_reset_keepalive_timer(meta_sk, tmo);
  9859. + } else {
  9860. + tcp_time_wait(meta_sk, TCP_FIN_WAIT2, tmo);
  9861. + }
  9862. + }
  9863. + }
  9864. + break;
  9865. + case TCP_CLOSING:
  9866. + case TCP_LAST_ACK:
  9867. + if (meta_tp->snd_una == meta_tp->write_seq) {
  9868. + tcp_done(meta_sk);
  9869. + return 1;
  9870. + }
  9871. + break;
  9872. + }
  9873. +
  9874. + /* step 7: process the segment text */
  9875. + switch (meta_sk->sk_state) {
  9876. + case TCP_FIN_WAIT1:
  9877. + case TCP_FIN_WAIT2:
  9878. + /* RFC 793 says to queue data in these states,
  9879. + * RFC 1122 says we MUST send a reset.
  9880. + * BSD 4.4 also does reset.
  9881. + */
  9882. + if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
  9883. + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
  9884. + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
  9885. + !mptcp_is_data_fin2(skb, tp)) {
  9886. + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
  9887. + mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
  9888. + tcp_reset(meta_sk);
  9889. + return 1;
  9890. + }
  9891. + }
  9892. + break;
  9893. + }
  9894. +
  9895. + return 0;
  9896. +}
  9897. +
  9898. +/**
  9899. + * @return:
  9900. + * i) 1: Everything's fine.
  9901. + * ii) -1: A reset has been sent on the subflow - csum-failure
  9902. + * iii) 0: csum-failure but no reset sent, because it's the last subflow.
  9903. + * Last packet should not be destroyed by the caller because it has
  9904. + * been done here.
  9905. + */
  9906. +static int mptcp_verif_dss_csum(struct sock *sk)
  9907. +{
  9908. + struct tcp_sock *tp = tcp_sk(sk);
  9909. + struct sk_buff *tmp, *tmp1, *last = NULL;
  9910. + __wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */
  9911. + int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0;
  9912. + int iter = 0;
  9913. +
  9914. + skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) {
  9915. + unsigned int csum_len;
  9916. +
  9917. + if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq))
  9918. + /* Mapping ends in the middle of the packet -
  9919. + * csum only these bytes
  9920. + */
  9921. + csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq;
  9922. + else
  9923. + csum_len = tmp->len;
  9924. +
  9925. + offset = 0;
  9926. + if (overflowed) {
  9927. + char first_word[4];
  9928. + first_word[0] = 0;
  9929. + first_word[1] = 0;
  9930. + first_word[2] = 0;
  9931. + first_word[3] = *(tmp->data);
  9932. + csum_tcp = csum_partial(first_word, 4, csum_tcp);
  9933. + offset = 1;
  9934. + csum_len--;
  9935. + overflowed = 0;
  9936. + }
  9937. +
  9938. + csum_tcp = skb_checksum(tmp, offset, csum_len, csum_tcp);
  9939. +
  9940. + /* Was it on an odd-length? Then we have to merge the next byte
  9941. + * correctly (see above)
  9942. + */
  9943. + if (csum_len != (csum_len & (~1)))
  9944. + overflowed = 1;
  9945. +
  9946. + if (mptcp_is_data_seq(tmp) && !dss_csum_added) {
  9947. + __be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32));
  9948. +
  9949. + /* If a 64-bit dss is present, we increase the offset
  9950. + * by 4 bytes, as the high-order 64-bits will be added
  9951. + * in the final csum_partial-call.
  9952. + */
  9953. + u32 offset = skb_transport_offset(tmp) +
  9954. + TCP_SKB_CB(tmp)->dss_off;
  9955. + if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET)
  9956. + offset += 4;
  9957. +
  9958. + csum_tcp = skb_checksum(tmp, offset,
  9959. + MPTCP_SUB_LEN_SEQ_CSUM,
  9960. + csum_tcp);
  9961. +
  9962. + csum_tcp = csum_partial(&data_seq,
  9963. + sizeof(data_seq), csum_tcp);
  9964. +
  9965. + dss_csum_added = 1; /* Just do it once */
  9966. + }
  9967. + last = tmp;
  9968. + iter++;
  9969. +
  9970. + if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) &&
  9971. + !before(TCP_SKB_CB(tmp1)->seq,
  9972. + tp->mptcp->map_subseq + tp->mptcp->map_data_len))
  9973. + break;
  9974. + }
  9975. +
  9976. + /* Now, checksum must be 0 */
  9977. + if (unlikely(csum_fold(csum_tcp))) {
  9978. + pr_err("%s csum is wrong: %#x data_seq %u dss_csum_added %d overflowed %d iterations %d\n",
  9979. + __func__, csum_fold(csum_tcp),
  9980. + TCP_SKB_CB(last)->seq, dss_csum_added, overflowed,
  9981. + iter);
  9982. +
  9983. + tp->mptcp->send_mp_fail = 1;
  9984. +
  9985. + /* map_data_seq is the data-seq number of the
  9986. + * mapping we are currently checking
  9987. + */
  9988. + tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq;
  9989. +
  9990. + if (tp->mpcb->cnt_subflows > 1) {
  9991. + mptcp_send_reset(sk);
  9992. + ans = -1;
  9993. + } else {
  9994. + tp->mpcb->send_infinite_mapping = 1;
  9995. +
  9996. + /* Need to purge the rcv-queue as it's no more valid */
  9997. + while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
  9998. + tp->copied_seq = TCP_SKB_CB(tmp)->end_seq;
  9999. + kfree_skb(tmp);
  10000. + }
  10001. +
  10002. + ans = 0;
  10003. + }
  10004. + }
  10005. +
  10006. + return ans;
  10007. +}
  10008. +
  10009. +static inline void mptcp_prepare_skb(struct sk_buff *skb, struct sk_buff *next,
  10010. + struct sock *sk)
  10011. +{
  10012. + struct tcp_sock *tp = tcp_sk(sk);
  10013. + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
  10014. + /* Adapt data-seq's to the packet itself. We kinda transform the
  10015. + * dss-mapping to a per-packet granularity. This is necessary to
  10016. + * correctly handle overlapping mappings coming from different
  10017. + * subflows. Otherwise it would be a complete mess.
  10018. + */
  10019. + tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq;
  10020. + tcb->end_seq = tcb->seq + skb->len;
  10021. +
  10022. + /* If cur is the last one in the rcv-queue (or the last one for this
  10023. + * mapping), and data_fin is enqueued, the end_data_seq is +1.
  10024. + */
  10025. + if (skb_queue_is_last(&sk->sk_receive_queue, skb) ||
  10026. + after(TCP_SKB_CB(next)->end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
  10027. + tcb->end_seq += tp->mptcp->map_data_fin;
  10028. +
  10029. + /* We manually set the fin-flag if it is a data-fin. For easy
  10030. + * processing in tcp_recvmsg.
  10031. + */
  10032. + if (mptcp_is_data_fin2(skb, tp))
  10033. + tcp_hdr(skb)->fin = 1;
  10034. + else
  10035. + tcp_hdr(skb)->fin = 0;
  10036. + } else {
  10037. + /* We may have a subflow-fin with data but without data-fin */
  10038. + tcp_hdr(skb)->fin = 0;
  10039. + }
  10040. +}
  10041. +
  10042. +/**
  10043. + * @return: 1 if the segment has been eaten and can be suppressed,
  10044. + * otherwise 0.
  10045. + */
  10046. +static inline int mptcp_direct_copy(struct sk_buff *skb, struct sock *meta_sk)
  10047. +{
  10048. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  10049. + int chunk = min_t(unsigned int, skb->len, meta_tp->ucopy.len);
  10050. + int eaten = 0;
  10051. +
  10052. + __set_current_state(TASK_RUNNING);
  10053. +
  10054. + local_bh_enable();
  10055. + if (!skb_copy_datagram_iovec(skb, 0, meta_tp->ucopy.iov, chunk)) {
  10056. + meta_tp->ucopy.len -= chunk;
  10057. + meta_tp->copied_seq += chunk;
  10058. + eaten = (chunk == skb->len);
  10059. + tcp_rcv_space_adjust(meta_sk);
  10060. + }
  10061. + local_bh_disable();
  10062. + return eaten;
  10063. +}
  10064. +
  10065. +static inline void mptcp_reset_mapping(struct tcp_sock *tp)
  10066. +{
  10067. + tp->mptcp->map_data_len = 0;
  10068. + tp->mptcp->map_data_seq = 0;
  10069. + tp->mptcp->map_subseq = 0;
  10070. + tp->mptcp->map_data_fin = 0;
  10071. + tp->mptcp->mapping_present = 0;
  10072. +}
  10073. +
  10074. +/* The DSS-mapping received on the sk only covers the second half of the skb
  10075. + * (cut at seq). We trim the head from the skb.
  10076. + * Data will be freed upon kfree().
  10077. + *
  10078. + * Inspired by tcp_trim_head().
  10079. + */
  10080. +static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq)
  10081. +{
  10082. + int len = seq - TCP_SKB_CB(skb)->seq;
  10083. + u32 new_seq = TCP_SKB_CB(skb)->seq + len;
  10084. +
  10085. + if (len < skb_headlen(skb))
  10086. + __skb_pull(skb, len);
  10087. + else
  10088. + __pskb_trim_head(skb, len - skb_headlen(skb));
  10089. +
  10090. + TCP_SKB_CB(skb)->seq = new_seq;
  10091. +
  10092. + skb->truesize -= len;
  10093. + atomic_sub(len, &sk->sk_rmem_alloc);
  10094. + sk_mem_uncharge(sk, len);
  10095. +}
  10096. +
  10097. +/* The DSS-mapping received on the sk only covers the first half of the skb
  10098. + * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue
  10099. + * as further packets may resolve the mapping of the second half of data.
  10100. + *
  10101. + * Inspired by tcp_fragment().
  10102. + */
  10103. +static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq)
  10104. +{
  10105. + struct sk_buff *buff;
  10106. + int nsize;
  10107. + int nlen, len;
  10108. +
  10109. + len = seq - TCP_SKB_CB(skb)->seq;
  10110. + nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len;
  10111. + if (nsize < 0)
  10112. + nsize = 0;
  10113. +
  10114. + /* Get a new skb... force flag on. */
  10115. + buff = alloc_skb(nsize, GFP_ATOMIC);
  10116. + if (buff == NULL)
  10117. + return -ENOMEM;
  10118. +
  10119. + skb_reserve(buff, tcp_sk(sk)->tcp_header_len);
  10120. + skb_reset_transport_header(buff);
  10121. +
  10122. + tcp_hdr(buff)->fin = tcp_hdr(skb)->fin;
  10123. + tcp_hdr(skb)->fin = 0;
  10124. +
  10125. + /* We absolutly need to call skb_set_owner_r before refreshing the
  10126. + * truesize of buff, otherwise the moved data will account twice.
  10127. + */
  10128. + skb_set_owner_r(buff, sk);
  10129. + nlen = skb->len - len - nsize;
  10130. + buff->truesize += nlen;
  10131. + skb->truesize -= nlen;
  10132. +
  10133. + /* Correct the sequence numbers. */
  10134. + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
  10135. + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
  10136. + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
  10137. +
  10138. + skb_split(skb, buff, len);
  10139. +
  10140. + __skb_queue_after(&sk->sk_receive_queue, skb, buff);
  10141. +
  10142. + return 0;
  10143. +}
  10144. +
  10145. +/* @return: 0 everything is fine. Just continue processing
  10146. + * 1 subflow is broken stop everything
  10147. + * -1 this packet was broken - continue with the next one.
  10148. + */
  10149. +static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb)
  10150. +{
  10151. + struct tcp_sock *tp = tcp_sk(sk);
  10152. +
  10153. + /* If we are in infinite mode, the subflow-fin is in fact a data-fin. */
  10154. + if (!skb->len && tcp_hdr(skb)->fin && !mptcp_is_data_fin(skb) &&
  10155. + !tp->mpcb->infinite_mapping_rcv) {
  10156. + /* Remove a pure subflow-fin from the queue and increase
  10157. + * copied_seq.
  10158. + */
  10159. + tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
  10160. + __skb_unlink(skb, &sk->sk_receive_queue);
  10161. + __kfree_skb(skb);
  10162. + return -1;
  10163. + }
  10164. +
  10165. + /* If we are not yet fully established and do not know the mapping for
  10166. + * this segment, this path has to fallback to infinite or be torn down.
  10167. + */
  10168. + if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) &&
  10169. + !tp->mptcp->mapping_present && !tp->mpcb->infinite_mapping_rcv) {
  10170. + pr_err("%s %#x will fallback - pi %d from %pS, seq %u\n",
  10171. + __func__, tp->mpcb->mptcp_loc_token,
  10172. + tp->mptcp->path_index, __builtin_return_address(0),
  10173. + TCP_SKB_CB(skb)->seq);
  10174. +
  10175. + if (!is_master_tp(tp)) {
  10176. + mptcp_send_reset(sk);
  10177. + return 1;
  10178. + }
  10179. +
  10180. + tp->mpcb->infinite_mapping_snd = 1;
  10181. + tp->mpcb->infinite_mapping_rcv = 1;
  10182. + tp->mptcp->fully_established = 1;
  10183. + }
  10184. +
  10185. + /* Receiver-side becomes fully established when a whole rcv-window has
  10186. + * been received without the need to fallback due to the previous
  10187. + * condition. */
  10188. + if (!tp->mptcp->fully_established) {
  10189. + tp->mptcp->init_rcv_wnd -= skb->len;
  10190. + if (tp->mptcp->init_rcv_wnd < 0)
  10191. + mptcp_become_fully_estab(sk);
  10192. + }
  10193. +
  10194. + return 0;
  10195. +}
  10196. +
  10197. +/* @return: 0 everything is fine. Just continue processing
  10198. + * 1 subflow is broken stop everything
  10199. + * -1 this packet was broken - continue with the next one.
  10200. + */
  10201. +static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb)
  10202. +{
  10203. + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
  10204. + struct mptcp_cb *mpcb = tp->mpcb;
  10205. + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
  10206. + u32 *ptr;
  10207. + u32 data_seq, sub_seq, data_len, tcp_end_seq;
  10208. +
  10209. + /* If we are in infinite-mapping-mode, the subflow is guaranteed to be
  10210. + * in-order at the data-level. Thus data-seq-numbers can be inferred
  10211. + * from what is expected at the data-level.
  10212. + */
  10213. + if (mpcb->infinite_mapping_rcv) {
  10214. + tp->mptcp->map_data_seq = mptcp_get_rcv_nxt_64(meta_tp);
  10215. + tp->mptcp->map_subseq = tcb->seq;
  10216. + tp->mptcp->map_data_len = skb->len;
  10217. + tp->mptcp->map_data_fin = tcp_hdr(skb)->fin;
  10218. + tp->mptcp->mapping_present = 1;
  10219. + return 0;
  10220. + }
  10221. +
  10222. + /* No mapping here? Exit - it is either already set or still on its way */
  10223. + if (!mptcp_is_data_seq(skb)) {
  10224. + /* Too many packets without a mapping - this subflow is broken */
  10225. + if (!tp->mptcp->mapping_present &&
  10226. + tp->rcv_nxt - tp->copied_seq > 65536) {
  10227. + mptcp_send_reset(sk);
  10228. + return 1;
  10229. + }
  10230. +
  10231. + return 0;
  10232. + }
  10233. +
  10234. + ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb);
  10235. + ptr++;
  10236. + sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn;
  10237. + ptr++;
  10238. + data_len = get_unaligned_be16(ptr);
  10239. +
  10240. + /* If it's an empty skb with DATA_FIN, sub_seq must get fixed.
  10241. + * The draft sets it to 0, but we really would like to have the
  10242. + * real value, to have an easy handling afterwards here in this
  10243. + * function.
  10244. + */
  10245. + if (mptcp_is_data_fin(skb) && skb->len == 0)
  10246. + sub_seq = TCP_SKB_CB(skb)->seq;
  10247. +
  10248. + /* If there is already a mapping - we check if it maps with the current
  10249. + * one. If not - we reset.
  10250. + */
  10251. + if (tp->mptcp->mapping_present &&
  10252. + (data_seq != (u32)tp->mptcp->map_data_seq ||
  10253. + sub_seq != tp->mptcp->map_subseq ||
  10254. + data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin ||
  10255. + mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) {
  10256. + /* Mapping in packet is different from what we want */
  10257. + pr_err("%s Mappings do not match!\n", __func__);
  10258. + pr_err("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n",
  10259. + __func__, data_seq, (u32)tp->mptcp->map_data_seq,
  10260. + sub_seq, tp->mptcp->map_subseq, data_len,
  10261. + tp->mptcp->map_data_len, mptcp_is_data_fin(skb),
  10262. + tp->mptcp->map_data_fin);
  10263. + mptcp_send_reset(sk);
  10264. + return 1;
  10265. + }
  10266. +
  10267. + /* If the previous check was good, the current mapping is valid and we exit. */
  10268. + if (tp->mptcp->mapping_present)
  10269. + return 0;
  10270. +
  10271. + /* Mapping not yet set on this subflow - we set it here! */
  10272. +
  10273. + if (!data_len) {
  10274. + mpcb->infinite_mapping_rcv = 1;
  10275. + tp->mptcp->fully_established = 1;
  10276. + /* We need to repeat mp_fail's until the sender felt
  10277. + * back to infinite-mapping - here we stop repeating it.
  10278. + */
  10279. + tp->mptcp->send_mp_fail = 0;
  10280. +
  10281. + /* We have to fixup data_len - it must be the same as skb->len */
  10282. + data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0);
  10283. + sub_seq = tcb->seq;
  10284. +
  10285. + /* TODO kill all other subflows than this one */
  10286. + /* data_seq and so on are set correctly */
  10287. +
  10288. + /* At this point, the meta-ofo-queue has to be emptied,
  10289. + * as the following data is guaranteed to be in-order at
  10290. + * the data and subflow-level
  10291. + */
  10292. + mptcp_purge_ofo_queue(meta_tp);
  10293. + }
  10294. +
  10295. + /* We are sending mp-fail's and thus are in fallback mode.
  10296. + * Ignore packets which do not announce the fallback and still
  10297. + * want to provide a mapping.
  10298. + */
  10299. + if (tp->mptcp->send_mp_fail) {
  10300. + tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
  10301. + __skb_unlink(skb, &sk->sk_receive_queue);
  10302. + __kfree_skb(skb);
  10303. + return -1;
  10304. + }
  10305. +
  10306. + /* FIN increased the mapping-length by 1 */
  10307. + if (mptcp_is_data_fin(skb))
  10308. + data_len--;
  10309. +
  10310. + /* Subflow-sequences of packet must be
  10311. + * (at least partially) be part of the DSS-mapping's
  10312. + * subflow-sequence-space.
  10313. + *
  10314. + * Basically the mapping is not valid, if either of the
  10315. + * following conditions is true:
  10316. + *
  10317. + * 1. It's not a data_fin and
  10318. + * MPTCP-sub_seq >= TCP-end_seq
  10319. + *
  10320. + * 2. It's a data_fin and TCP-end_seq > TCP-seq and
  10321. + * MPTCP-sub_seq >= TCP-end_seq
  10322. + *
  10323. + * The previous two can be merged into:
  10324. + * TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq
  10325. + * Because if it's not a data-fin, TCP-end_seq > TCP-seq
  10326. + *
  10327. + * 3. It's a data_fin and skb->len == 0 and
  10328. + * MPTCP-sub_seq > TCP-end_seq
  10329. + *
  10330. + * 4. It's not a data_fin and TCP-end_seq > TCP-seq and
  10331. + * MPTCP-sub_seq + MPTCP-data_len <= TCP-seq
  10332. + *
  10333. + * 5. MPTCP-sub_seq is prior to what we already copied (copied_seq)
  10334. + */
  10335. +
  10336. + /* subflow-fin is not part of the mapping - ignore it here ! */
  10337. + tcp_end_seq = tcb->end_seq - tcp_hdr(skb)->fin;
  10338. + if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) ||
  10339. + (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) ||
  10340. + (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq)) ||
  10341. + before(sub_seq, tp->copied_seq)) {
  10342. + /* Subflow-sequences of packet is different from what is in the
  10343. + * packet's dss-mapping. The peer is misbehaving - reset
  10344. + */
  10345. + pr_err("%s Packet's mapping does not map to the DSS sub_seq %u "
  10346. + "end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u"
  10347. + "copied_seq %u\n", __func__, sub_seq, tcb->end_seq, tcp_end_seq, tcb->seq, mptcp_is_data_fin(skb),
  10348. + skb->len, data_len, tp->copied_seq);
  10349. + mptcp_send_reset(sk);
  10350. + return 1;
  10351. + }
  10352. +
  10353. + /* Does the DSS had 64-bit seqnum's ? */
  10354. + if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) {
  10355. + /* Wrapped around? */
  10356. + if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) {
  10357. + tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq);
  10358. + } else {
  10359. + /* Else, access the default high-order bits */
  10360. + tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq);
  10361. + }
  10362. + } else {
  10363. + tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq);
  10364. +
  10365. + if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) {
  10366. + /* We make sure that the data_seq is invalid.
  10367. + * It will be dropped later.
  10368. + */
  10369. + tp->mptcp->map_data_seq += 0xFFFFFFFF;
  10370. + tp->mptcp->map_data_seq += 0xFFFFFFFF;
  10371. + }
  10372. + }
  10373. +
  10374. + tp->mptcp->map_data_len = data_len;
  10375. + tp->mptcp->map_subseq = sub_seq;
  10376. + tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0;
  10377. + tp->mptcp->mapping_present = 1;
  10378. +
  10379. + return 0;
  10380. +}
  10381. +
  10382. +/* Similar to tcp_sequence(...) */
  10383. +static inline int mptcp_sequence(const struct tcp_sock *meta_tp,
  10384. + u64 data_seq, u64 end_data_seq)
  10385. +{
  10386. + struct mptcp_cb *mpcb = meta_tp->mpcb;
  10387. + u64 rcv_wup64;
  10388. +
  10389. + /* Wrap-around? */
  10390. + if (meta_tp->rcv_wup > meta_tp->rcv_nxt) {
  10391. + rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) |
  10392. + meta_tp->rcv_wup;
  10393. + } else {
  10394. + rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
  10395. + meta_tp->rcv_wup);
  10396. + }
  10397. +
  10398. + return !before64(end_data_seq, rcv_wup64) &&
  10399. + !after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window(meta_tp));
  10400. +}
  10401. +
  10402. +/* @return: 0 everything is fine. Just continue processing
  10403. + * -1 this packet was broken - continue with the next one.
  10404. + */
  10405. +static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb)
  10406. +{
  10407. + struct tcp_sock *tp = tcp_sk(sk);
  10408. + struct sk_buff *tmp, *tmp1;
  10409. + u32 tcp_end_seq;
  10410. +
  10411. + if (!tp->mptcp->mapping_present)
  10412. + return 0;
  10413. +
  10414. + /* either, the new skb gave us the mapping and the first segment
  10415. + * in the sub-rcv-queue has to be trimmed ...
  10416. + */
  10417. + tmp = skb_peek(&sk->sk_receive_queue);
  10418. + if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) &&
  10419. + after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq))
  10420. + mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq);
  10421. +
  10422. + /* ... or the new skb (tail) has to be split at the end. */
  10423. + tcp_end_seq = TCP_SKB_CB(skb)->end_seq - (tcp_hdr(skb)->fin ? 1 : 0);
  10424. + if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
  10425. + u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len;
  10426. + if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */
  10427. + /* TODO : maybe handle this here better.
  10428. + * We now just force meta-retransmission.
  10429. + */
  10430. + tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
  10431. + __skb_unlink(skb, &sk->sk_receive_queue);
  10432. + __kfree_skb(skb);
  10433. + return -1;
  10434. + }
  10435. + }
  10436. +
  10437. + /* Now, remove old sk_buff's from the receive-queue.
  10438. + * This may happen if the mapping has been lost for these segments and
  10439. + * the next mapping has already been received.
  10440. + */
  10441. + if (tp->mptcp->mapping_present &&
  10442. + before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) {
  10443. + skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
  10444. + if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq))
  10445. + break;
  10446. +
  10447. + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
  10448. + __skb_unlink(tmp1, &sk->sk_receive_queue);
  10449. +
  10450. + /* Impossible that we could free skb here, because his
  10451. + * mapping is known to be valid from previous checks
  10452. + */
  10453. + __kfree_skb(tmp1);
  10454. + }
  10455. + }
  10456. +
  10457. + return 0;
  10458. +}
  10459. +
  10460. +/* @return: 0 everything is fine. Just continue processing
  10461. + * 1 subflow is broken stop everything
  10462. + * -1 this mapping has been put in the meta-receive-queue
  10463. + * -2 this mapping has been eaten by the application
  10464. + */
  10465. +static int mptcp_queue_skb(struct sock *sk)
  10466. +{
  10467. + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
  10468. + struct sock *meta_sk = mptcp_meta_sk(sk);
  10469. + struct mptcp_cb *mpcb = tp->mpcb;
  10470. + struct sk_buff *tmp, *tmp1;
  10471. + u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp);
  10472. + bool data_queued = false;
  10473. +
  10474. + /* Have we not yet received the full mapping? */
  10475. + if (!tp->mptcp->mapping_present ||
  10476. + before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len))
  10477. + return 0;
  10478. +
  10479. + /* Is this an overlapping mapping? rcv_nxt >= end_data_seq
  10480. + * OR
  10481. + * This mapping is out of window
  10482. + */
  10483. + if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) ||
  10484. + !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq,
  10485. + tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) {
  10486. + skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
  10487. + __skb_unlink(tmp1, &sk->sk_receive_queue);
  10488. + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
  10489. + __kfree_skb(tmp1);
  10490. +
  10491. + if (!skb_queue_empty(&sk->sk_receive_queue) &&
  10492. + !before(TCP_SKB_CB(tmp)->seq,
  10493. + tp->mptcp->map_subseq + tp->mptcp->map_data_len))
  10494. + break;
  10495. + }
  10496. +
  10497. + mptcp_reset_mapping(tp);
  10498. +
  10499. + return -1;
  10500. + }
  10501. +
  10502. + /* Record it, because we want to send our data_fin on the same path */
  10503. + if (tp->mptcp->map_data_fin) {
  10504. + mpcb->dfin_path_index = tp->mptcp->path_index;
  10505. + mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN);
  10506. + }
  10507. +
  10508. + /* Verify the checksum */
  10509. + if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) {
  10510. + int ret = mptcp_verif_dss_csum(sk);
  10511. +
  10512. + if (ret <= 0) {
  10513. + mptcp_reset_mapping(tp);
  10514. + return 1;
  10515. + }
  10516. + }
  10517. +
  10518. + if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) {
  10519. + /* Seg's have to go to the meta-ofo-queue */
  10520. + skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
  10521. + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
  10522. + mptcp_prepare_skb(tmp1, tmp, sk);
  10523. + __skb_unlink(tmp1, &sk->sk_receive_queue);
  10524. + /* MUST be done here, because fragstolen may be true later.
  10525. + * Then, kfree_skb_partial will not account the memory.
  10526. + */
  10527. + skb_orphan(tmp1);
  10528. +
  10529. + if (!mpcb->in_time_wait) /* In time-wait, do not receive data */
  10530. + mptcp_add_meta_ofo_queue(meta_sk, tmp1, sk);
  10531. + else
  10532. + __kfree_skb(tmp1);
  10533. +
  10534. + if (!skb_queue_empty(&sk->sk_receive_queue) &&
  10535. + !before(TCP_SKB_CB(tmp)->seq,
  10536. + tp->mptcp->map_subseq + tp->mptcp->map_data_len))
  10537. + break;
  10538. +
  10539. + }
  10540. + } else {
  10541. + /* Ready for the meta-rcv-queue */
  10542. + skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
  10543. + int eaten = 0;
  10544. + int copied_early = 0;
  10545. + bool fragstolen = false;
  10546. + u32 old_rcv_nxt = meta_tp->rcv_nxt;
  10547. +
  10548. + tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
  10549. + mptcp_prepare_skb(tmp1, tmp, sk);
  10550. + __skb_unlink(tmp1, &sk->sk_receive_queue);
  10551. + /* MUST be done here, because fragstolen may be true.
  10552. + * Then, kfree_skb_partial will not account the memory.
  10553. + */
  10554. + skb_orphan(tmp1);
  10555. +
  10556. + /* This segment has already been received */
  10557. + if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) {
  10558. + __kfree_skb(tmp1);
  10559. + goto next;
  10560. + }
  10561. +
  10562. +#ifdef CONFIG_NET_DMA
  10563. + if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
  10564. + meta_tp->ucopy.task == current &&
  10565. + meta_tp->copied_seq == meta_tp->rcv_nxt &&
  10566. + tmp1->len <= meta_tp->ucopy.len &&
  10567. + sock_owned_by_user(meta_sk) &&
  10568. + tcp_dma_try_early_copy(meta_sk, tmp1, 0)) {
  10569. + copied_early = 1;
  10570. + eaten = 1;
  10571. + }
  10572. +#endif
  10573. +
  10574. + /* Is direct copy possible ? */
  10575. + if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
  10576. + meta_tp->ucopy.task == current &&
  10577. + meta_tp->copied_seq == meta_tp->rcv_nxt &&
  10578. + meta_tp->ucopy.len && sock_owned_by_user(meta_sk) &&
  10579. + !copied_early)
  10580. + eaten = mptcp_direct_copy(tmp1, meta_sk);
  10581. +
  10582. + if (mpcb->in_time_wait) /* In time-wait, do not receive data */
  10583. + eaten = 1;
  10584. +
  10585. + if (!eaten)
  10586. + eaten = tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen);
  10587. +
  10588. + meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq;
  10589. + mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
  10590. +
  10591. + if (copied_early)
  10592. + tcp_cleanup_rbuf(meta_sk, tmp1->len);
  10593. +
  10594. + if (tcp_hdr(tmp1)->fin && !mpcb->in_time_wait)
  10595. + mptcp_fin(meta_sk);
  10596. +
  10597. + /* Check if this fills a gap in the ofo queue */
  10598. + if (!skb_queue_empty(&meta_tp->out_of_order_queue))
  10599. + mptcp_ofo_queue(meta_sk);
  10600. +
  10601. +#ifdef CONFIG_NET_DMA
  10602. + if (copied_early)
  10603. + __skb_queue_tail(&meta_sk->sk_async_wait_queue,
  10604. + tmp1);
  10605. + else
  10606. +#endif
  10607. + if (eaten)
  10608. + kfree_skb_partial(tmp1, fragstolen);
  10609. +
  10610. + data_queued = true;
  10611. +next:
  10612. + if (!skb_queue_empty(&sk->sk_receive_queue) &&
  10613. + !before(TCP_SKB_CB(tmp)->seq,
  10614. + tp->mptcp->map_subseq + tp->mptcp->map_data_len))
  10615. + break;
  10616. + }
  10617. + }
  10618. +
  10619. + inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_time_stamp;
  10620. + tp->mptcp->last_data_seq = tp->mptcp->map_data_seq;
  10621. + mptcp_reset_mapping(tp);
  10622. +
  10623. + return data_queued ? -1 : -2;
  10624. +}
  10625. +
  10626. +void mptcp_data_ready(struct sock *sk, int bytes)
  10627. +{
  10628. + struct sock *meta_sk = mptcp_meta_sk(sk);
  10629. + struct sk_buff *skb, *tmp;
  10630. + int queued = 0;
  10631. +
  10632. + /* If the meta is already closed, there is no point in pushing data */
  10633. + if (meta_sk->sk_state == TCP_CLOSE && !tcp_sk(sk)->mpcb->in_time_wait) {
  10634. + skb_queue_purge(&sk->sk_receive_queue);
  10635. + tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt;
  10636. + goto exit;
  10637. + }
  10638. +
  10639. +restart:
  10640. + /* Iterate over all segments, detect their mapping (if we don't have
  10641. + * one yet), validate them and push everything one level higher.
  10642. + */
  10643. + skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) {
  10644. + int ret;
  10645. + /* Pre-validation - e.g., early fallback */
  10646. + ret = mptcp_prevalidate_skb(sk, skb);
  10647. + if (ret < 0)
  10648. + goto restart;
  10649. + else if (ret > 0)
  10650. + break;
  10651. +
  10652. + /* Set the current mapping */
  10653. + ret = mptcp_detect_mapping(sk, skb);
  10654. + if (ret < 0)
  10655. + goto restart;
  10656. + else if (ret > 0)
  10657. + break;
  10658. +
  10659. + /* Validation */
  10660. + if (mptcp_validate_mapping(sk, skb) < 0)
  10661. + goto restart;
  10662. +
  10663. + /* Push a level higher */
  10664. + ret = mptcp_queue_skb(sk);
  10665. + if (ret < 0) {
  10666. + if (ret == -1)
  10667. + queued = ret;
  10668. + goto restart;
  10669. + } else if (ret == 0) {
  10670. + continue;
  10671. + } else { /* ret == 1 */
  10672. + break;
  10673. + }
  10674. + }
  10675. +
  10676. +exit:
  10677. + if (tcp_sk(sk)->close_it) {
  10678. + tcp_send_ack(sk);
  10679. + tcp_time_wait(sk, TCP_TIME_WAIT, 0);
  10680. + }
  10681. +
  10682. + if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD))
  10683. + meta_sk->sk_data_ready(meta_sk, 0);
  10684. +}
  10685. +
  10686. +
  10687. +int mptcp_check_req(struct sk_buff *skb, struct net *net)
  10688. +{
  10689. + struct tcphdr *th = tcp_hdr(skb);
  10690. + struct sock *meta_sk = NULL;
  10691. +
  10692. + /* MPTCP structures not initialized */
  10693. + if (mptcp_init_failed)
  10694. + return 0;
  10695. +
  10696. + if (skb->protocol == htons(ETH_P_IP))
  10697. + meta_sk = mptcp_v4_search_req(th->source, ip_hdr(skb)->saddr,
  10698. + ip_hdr(skb)->daddr, net);
  10699. +#if IS_ENABLED(CONFIG_IPV6)
  10700. + else /* IPv6 */
  10701. + meta_sk = mptcp_v6_search_req(th->source, &ipv6_hdr(skb)->saddr,
  10702. + &ipv6_hdr(skb)->daddr, net);
  10703. +#endif /* CONFIG_IPV6 */
  10704. +
  10705. + if (!meta_sk)
  10706. + return 0;
  10707. +
  10708. + TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_JOIN;
  10709. +
  10710. + bh_lock_sock_nested(meta_sk);
  10711. + if (sock_owned_by_user(meta_sk)) {
  10712. + skb->sk = meta_sk;
  10713. + if (unlikely(sk_add_backlog(meta_sk, skb,
  10714. + meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
  10715. + bh_unlock_sock(meta_sk);
  10716. + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
  10717. + sock_put(meta_sk); /* Taken by mptcp_search_req */
  10718. + kfree_skb(skb);
  10719. + return 1;
  10720. + }
  10721. + } else if (skb->protocol == htons(ETH_P_IP)) {
  10722. + tcp_v4_do_rcv(meta_sk, skb);
  10723. +#if IS_ENABLED(CONFIG_IPV6)
  10724. + } else { /* IPv6 */
  10725. + tcp_v6_do_rcv(meta_sk, skb);
  10726. +#endif /* CONFIG_IPV6 */
  10727. + }
  10728. + bh_unlock_sock(meta_sk);
  10729. + sock_put(meta_sk); /* Taken by mptcp_vX_search_req */
  10730. + return 1;
  10731. +}
  10732. +
  10733. +struct mp_join *mptcp_find_join(struct sk_buff *skb)
  10734. +{
  10735. + struct tcphdr *th = tcp_hdr(skb);
  10736. + unsigned char *ptr;
  10737. + int length = (th->doff * 4) - sizeof(struct tcphdr);
  10738. +
  10739. + /* Jump through the options to check whether JOIN is there */
  10740. + ptr = (unsigned char *)(th + 1);
  10741. + while (length > 0) {
  10742. + int opcode = *ptr++;
  10743. + int opsize;
  10744. +
  10745. + switch (opcode) {
  10746. + case TCPOPT_EOL:
  10747. + return NULL;
  10748. + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
  10749. + length--;
  10750. + continue;
  10751. + default:
  10752. + opsize = *ptr++;
  10753. + if (opsize < 2) /* "silly options" */
  10754. + return NULL;
  10755. + if (opsize > length)
  10756. + return NULL; /* don't parse partial options */
  10757. + if (opcode == TCPOPT_MPTCP &&
  10758. + ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) {
  10759. + return (struct mp_join *)(ptr - 2);
  10760. + }
  10761. + ptr += opsize - 2;
  10762. + length -= opsize;
  10763. + }
  10764. + }
  10765. + return NULL;
  10766. +}
  10767. +
  10768. +int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw)
  10769. +{
  10770. + struct mptcp_cb *mpcb;
  10771. + struct sock *meta_sk;
  10772. + u32 token;
  10773. + struct mp_join *join_opt = mptcp_find_join(skb);
  10774. + if (!join_opt)
  10775. + return 0;
  10776. +
  10777. + /* MPTCP structures were not initialized, so return error */
  10778. + if (mptcp_init_failed)
  10779. + return -1;
  10780. +
  10781. + token = join_opt->u.syn.token;
  10782. + meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token);
  10783. + if (!meta_sk) {
  10784. + mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
  10785. + return -1;
  10786. + }
  10787. +
  10788. + mpcb = tcp_sk(meta_sk)->mpcb;
  10789. + if (mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) {
  10790. + /* We are in fallback-mode on the reception-side -
  10791. + * no new subflows!
  10792. + */
  10793. + sock_put(meta_sk); /* Taken by mptcp_hash_find */
  10794. + return -1;
  10795. + }
  10796. +
  10797. + /* Coming from time-wait-sock processing in tcp_v4_rcv.
  10798. + * We have to deschedule it before continuing, because otherwise
  10799. + * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req.
  10800. + */
  10801. + if (tw) {
  10802. + inet_twsk_deschedule(tw, &tcp_death_row);
  10803. + inet_twsk_put(tw);
  10804. + }
  10805. +
  10806. + TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_JOIN;
  10807. + /* OK, this is a new syn/join, let's create a new open request and
  10808. + * send syn+ack
  10809. + */
  10810. + bh_lock_sock_nested(meta_sk);
  10811. + if (sock_owned_by_user(meta_sk)) {
  10812. + skb->sk = meta_sk;
  10813. + if (unlikely(sk_add_backlog(meta_sk, skb,
  10814. + meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
  10815. + bh_unlock_sock(meta_sk);
  10816. + NET_INC_STATS_BH(sock_net(meta_sk),
  10817. + LINUX_MIB_TCPBACKLOGDROP);
  10818. + sock_put(meta_sk); /* Taken by mptcp_hash_find */
  10819. + kfree_skb(skb);
  10820. + return 1;
  10821. + }
  10822. + } else if (skb->protocol == htons(ETH_P_IP)) {
  10823. + tcp_v4_do_rcv(meta_sk, skb);
  10824. +#if IS_ENABLED(CONFIG_IPV6)
  10825. + } else {
  10826. + tcp_v6_do_rcv(meta_sk, skb);
  10827. +#endif /* CONFIG_IPV6 */
  10828. + }
  10829. + bh_unlock_sock(meta_sk);
  10830. + sock_put(meta_sk); /* Taken by mptcp_hash_find */
  10831. + return 1;
  10832. +}
  10833. +
  10834. +int mptcp_do_join_short(struct sk_buff *skb, struct mptcp_options_received *mopt,
  10835. + struct tcp_options_received *tmp_opt, struct net *net)
  10836. +{
  10837. + struct sock *meta_sk;
  10838. + u32 token;
  10839. +
  10840. + token = mopt->mptcp_rem_token;
  10841. + meta_sk = mptcp_hash_find(net, token);
  10842. + if (!meta_sk) {
  10843. + mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
  10844. + return -1;
  10845. + }
  10846. +
  10847. + TCP_SKB_CB(skb)->mptcp_flags = MPTCPHDR_JOIN;
  10848. +
  10849. + /* OK, this is a new syn/join, let's create a new open request and
  10850. + * send syn+ack
  10851. + */
  10852. + bh_lock_sock(meta_sk);
  10853. +
  10854. + /* This check is also done in mptcp_vX_do_rcv. But, there we cannot
  10855. + * call tcp_vX_send_reset, because we hold already two socket-locks.
  10856. + * (the listener and the meta from above)
  10857. + *
  10858. + * And the send-reset will try to take yet another one (ip_send_reply).
  10859. + * Thus, we propagate the reset up to tcp_rcv_state_process.
  10860. + */
  10861. + if (tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv ||
  10862. + tcp_sk(meta_sk)->mpcb->send_infinite_mapping ||
  10863. + meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table) {
  10864. + bh_unlock_sock(meta_sk);
  10865. + sock_put(meta_sk); /* Taken by mptcp_hash_find */
  10866. + return -1;
  10867. + }
  10868. +
  10869. + if (sock_owned_by_user(meta_sk)) {
  10870. + skb->sk = meta_sk;
  10871. + if (unlikely(sk_add_backlog(meta_sk, skb,
  10872. + meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf)))
  10873. + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
  10874. + else
  10875. + /* Must make sure that upper layers won't free the
  10876. + * skb if it is added to the backlog-queue.
  10877. + */
  10878. + skb_get(skb);
  10879. + } else {
  10880. + /* mptcp_v4_do_rcv tries to free the skb - we prevent this, as
  10881. + * the skb will finally be freed by tcp_v4_do_rcv (where we are
  10882. + * coming from)
  10883. + */
  10884. + skb_get(skb);
  10885. + if (skb->protocol == htons(ETH_P_IP)) {
  10886. + tcp_v4_do_rcv(meta_sk, skb);
  10887. +#if IS_ENABLED(CONFIG_IPV6)
  10888. + } else { /* IPv6 */
  10889. + tcp_v6_do_rcv(meta_sk, skb);
  10890. +#endif /* CONFIG_IPV6 */
  10891. + }
  10892. + }
  10893. +
  10894. + bh_unlock_sock(meta_sk);
  10895. + sock_put(meta_sk); /* Taken by mptcp_hash_find */
  10896. + return 0;
  10897. +}
  10898. +
  10899. +/**
  10900. + * Equivalent of tcp_fin() for MPTCP
  10901. + * Can be called only when the FIN is validly part
  10902. + * of the data seqnum space. Not before when we get holes.
  10903. + */
  10904. +void mptcp_fin(struct sock *meta_sk)
  10905. +{
  10906. + struct sock *sk = NULL, *sk_it;
  10907. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  10908. + struct mptcp_cb *mpcb = meta_tp->mpcb;
  10909. +
  10910. + mptcp_for_each_sk(mpcb, sk_it) {
  10911. + if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) {
  10912. + sk = sk_it;
  10913. + break;
  10914. + }
  10915. + }
  10916. +
  10917. + if (!sk || sk->sk_state == TCP_CLOSE)
  10918. + sk = mptcp_select_ack_sock(meta_sk, 0);
  10919. +
  10920. + inet_csk_schedule_ack(sk);
  10921. +
  10922. + meta_sk->sk_shutdown |= RCV_SHUTDOWN;
  10923. + sock_set_flag(meta_sk, SOCK_DONE);
  10924. +
  10925. + switch (meta_sk->sk_state) {
  10926. + case TCP_SYN_RECV:
  10927. + case TCP_ESTABLISHED:
  10928. + /* Move to CLOSE_WAIT */
  10929. + tcp_set_state(meta_sk, TCP_CLOSE_WAIT);
  10930. + inet_csk(sk)->icsk_ack.pingpong = 1;
  10931. + break;
  10932. +
  10933. + case TCP_CLOSE_WAIT:
  10934. + case TCP_CLOSING:
  10935. + /* Received a retransmission of the FIN, do
  10936. + * nothing.
  10937. + */
  10938. + break;
  10939. + case TCP_LAST_ACK:
  10940. + /* RFC793: Remain in the LAST-ACK state. */
  10941. + break;
  10942. +
  10943. + case TCP_FIN_WAIT1:
  10944. + /* This case occurs when a simultaneous close
  10945. + * happens, we must ack the received FIN and
  10946. + * enter the CLOSING state.
  10947. + */
  10948. + tcp_send_ack(sk);
  10949. + tcp_set_state(meta_sk, TCP_CLOSING);
  10950. + break;
  10951. + case TCP_FIN_WAIT2:
  10952. + /* Received a FIN -- send ACK and enter TIME_WAIT. */
  10953. + tcp_send_ack(sk);
  10954. + tcp_time_wait(meta_sk, TCP_TIME_WAIT, 0);
  10955. + break;
  10956. + default:
  10957. + /* Only TCP_LISTEN and TCP_CLOSE are left, in these
  10958. + * cases we should never reach this piece of code.
  10959. + */
  10960. + pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__,
  10961. + meta_sk->sk_state);
  10962. + break;
  10963. + }
  10964. +
  10965. + /* It _is_ possible, that we have something out-of-order _after_ FIN.
  10966. + * Probably, we should reset in this case. For now drop them.
  10967. + */
  10968. + mptcp_purge_ofo_queue(meta_tp);
  10969. + sk_mem_reclaim(meta_sk);
  10970. +
  10971. + if (!sock_flag(meta_sk, SOCK_DEAD)) {
  10972. + meta_sk->sk_state_change(meta_sk);
  10973. +
  10974. + /* Do not send POLL_HUP for half duplex close. */
  10975. + if (meta_sk->sk_shutdown == SHUTDOWN_MASK ||
  10976. + meta_sk->sk_state == TCP_CLOSE)
  10977. + sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP);
  10978. + else
  10979. + sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN);
  10980. + }
  10981. +
  10982. + return;
  10983. +}
  10984. +
  10985. +static void mptcp_xmit_retransmit_queue(struct sock *meta_sk)
  10986. +{
  10987. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  10988. + struct sk_buff *skb;
  10989. +
  10990. + if (!meta_tp->packets_out)
  10991. + return;
  10992. +
  10993. + tcp_for_write_queue(skb, meta_sk) {
  10994. + if (skb == tcp_send_head(meta_sk))
  10995. + break;
  10996. +
  10997. + if (mptcp_retransmit_skb(meta_sk, skb))
  10998. + return;
  10999. +
  11000. + if (skb == tcp_write_queue_head(meta_sk))
  11001. + inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
  11002. + inet_csk(meta_sk)->icsk_rto,
  11003. + TCP_RTO_MAX);
  11004. + }
  11005. +}
  11006. +
  11007. +/* Handle the DATA_ACK */
  11008. +static void mptcp_data_ack(struct sock *sk, const struct sk_buff *skb)
  11009. +{
  11010. + struct sock *meta_sk = mptcp_meta_sk(sk);
  11011. + struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
  11012. + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
  11013. + u32 prior_snd_una = meta_tp->snd_una;
  11014. + int prior_packets;
  11015. + u32 nwin, data_ack, data_seq;
  11016. + u16 data_len = 0;
  11017. +
  11018. + /* A valid packet came in - subflow is operational again */
  11019. + tp->pf = 0;
  11020. +
  11021. + /* Even if there is no data-ack, we stop retransmitting.
  11022. + * Except if this is a SYN/ACK. Then it is just a retransmission
  11023. + */
  11024. + if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) {
  11025. + tp->mptcp->pre_established = 0;
  11026. + sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
  11027. + }
  11028. +
  11029. + /* If we are in infinite mapping mode, rx_opt.data_ack has been
  11030. + * set by mptcp_clean_rtx_infinite.
  11031. + */
  11032. + if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd)
  11033. + goto exit;
  11034. +
  11035. + data_ack = tp->mptcp->rx_opt.data_ack;
  11036. +
  11037. + if (unlikely(!tp->mptcp->fully_established) &&
  11038. + (data_ack != meta_tp->mptcp->snt_isn ||
  11039. + tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq))
  11040. + /* As soon as data has been data-acked,
  11041. + * or a subflow-data-ack (not acking syn - thus snt_isn + 1)
  11042. + * includes a data-ack, we are fully established
  11043. + */
  11044. + mptcp_become_fully_estab(sk);
  11045. +
  11046. + /* Get the data_seq */
  11047. + if (mptcp_is_data_seq(skb)) {
  11048. + data_seq = tp->mptcp->rx_opt.data_seq;
  11049. + data_len = tp->mptcp->rx_opt.data_len;
  11050. + } else {
  11051. + data_seq = meta_tp->snd_wl1;
  11052. + }
  11053. +
  11054. + /* If the ack is older than previous acks
  11055. + * then we can probably ignore it.
  11056. + */
  11057. + if (before(data_ack, prior_snd_una))
  11058. + goto exit;
  11059. +
  11060. + /* If the ack includes data we haven't sent yet, discard
  11061. + * this segment (RFC793 Section 3.9).
  11062. + */
  11063. + if (after(data_ack, meta_tp->snd_nxt))
  11064. + goto exit;
  11065. +
  11066. + /*** Now, update the window - inspired by tcp_ack_update_window ***/
  11067. + nwin = ntohs(tcp_hdr(skb)->window);
  11068. +
  11069. + if (likely(!tcp_hdr(skb)->syn))
  11070. + nwin <<= tp->rx_opt.snd_wscale;
  11071. +
  11072. + if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) {
  11073. + tcp_update_wl(meta_tp, data_seq);
  11074. +
  11075. + /* Draft v09, Section 3.3.5:
  11076. + * [...] It should only update its local receive window values
  11077. + * when the largest sequence number allowed (i.e. DATA_ACK +
  11078. + * receive window) increases. [...]
  11079. + */
  11080. + if (meta_tp->snd_wnd != nwin &&
  11081. + !before(data_ack + nwin, tcp_wnd_end(meta_tp))) {
  11082. + meta_tp->snd_wnd = nwin;
  11083. +
  11084. + if (nwin > meta_tp->max_window)
  11085. + meta_tp->max_window = nwin;
  11086. + }
  11087. + }
  11088. + /*** Done, update the window ***/
  11089. +
  11090. + /* We passed data and got it acked, remove any soft error
  11091. + * log. Something worked...
  11092. + */
  11093. + sk->sk_err_soft = 0;
  11094. + inet_csk(meta_sk)->icsk_probes_out = 0;
  11095. + meta_tp->rcv_tstamp = tcp_time_stamp;
  11096. + prior_packets = meta_tp->packets_out;
  11097. + if (!prior_packets)
  11098. + goto no_queue;
  11099. +
  11100. + meta_tp->snd_una = data_ack;
  11101. +
  11102. + mptcp_clean_rtx_queue(meta_sk, prior_snd_una);
  11103. +
  11104. + /* We are in loss-state, and something got acked, retransmit the whole
  11105. + * queue now!
  11106. + */
  11107. + if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss &&
  11108. + after(data_ack, prior_snd_una)) {
  11109. + mptcp_xmit_retransmit_queue(meta_sk);
  11110. + inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open;
  11111. + }
  11112. +
  11113. + /* Simplified version of tcp_new_space, because the snd-buffer
  11114. + * is handled by all the subflows.
  11115. + */
  11116. + if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) {
  11117. + sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK);
  11118. + if (meta_sk->sk_socket &&
  11119. + test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
  11120. + meta_sk->sk_write_space(meta_sk);
  11121. + }
  11122. +
  11123. + if (meta_sk->sk_state != TCP_ESTABLISHED &&
  11124. + mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len))
  11125. + return;
  11126. +
  11127. +exit:
  11128. + mptcp_push_pending_frames(meta_sk);
  11129. +
  11130. + return;
  11131. +
  11132. +no_queue:
  11133. + if (tcp_send_head(meta_sk))
  11134. + tcp_ack_probe(meta_sk);
  11135. +
  11136. + mptcp_push_pending_frames(meta_sk);
  11137. +
  11138. + return;
  11139. +}
  11140. +
  11141. +void mptcp_clean_rtx_infinite(struct sk_buff *skb, struct sock *sk)
  11142. +{
  11143. + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(mptcp_meta_sk(sk));
  11144. +
  11145. + if (!tp->mpcb->infinite_mapping_snd)
  11146. + return;
  11147. +
  11148. + /* The difference between both write_seq's represents the offset between
  11149. + * data-sequence and subflow-sequence. As we are infinite, this must
  11150. + * match.
  11151. + *
  11152. + * Thus, from this difference we can infer the meta snd_una.
  11153. + */
  11154. + tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt - tp->snd_nxt +
  11155. + tp->snd_una;
  11156. +
  11157. + mptcp_data_ack(sk, skb);
  11158. +}
  11159. +
  11160. +/**** static functions used by mptcp_parse_options */
  11161. +
  11162. +static inline int mptcp_rem_raddress(struct mptcp_cb *mpcb, u8 rem_id)
  11163. +{
  11164. + if (mptcp_v4_rem_raddress(mpcb, rem_id) < 0) {
  11165. +#if IS_ENABLED(CONFIG_IPV6)
  11166. + if (mptcp_v6_rem_raddress(mpcb, rem_id) < 0)
  11167. + return -1;
  11168. +#else
  11169. + return -1;
  11170. +#endif /* CONFIG_IPV6 */
  11171. + }
  11172. + return 0;
  11173. +}
  11174. +
  11175. +static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id)
  11176. +{
  11177. + struct sock *sk_it, *tmpsk;
  11178. +
  11179. + mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
  11180. + if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) {
  11181. + mptcp_reinject_data(sk_it, 0);
  11182. + sk_it->sk_err = ECONNRESET;
  11183. + if (tcp_need_reset(sk_it->sk_state))
  11184. + tcp_send_active_reset(sk_it, GFP_ATOMIC);
  11185. + mptcp_sub_force_close(sk_it);
  11186. + }
  11187. + }
  11188. +}
  11189. +
  11190. +void mptcp_parse_options(const uint8_t *ptr, int opsize,
  11191. + struct tcp_options_received *opt_rx,
  11192. + struct mptcp_options_received *mopt,
  11193. + const struct sk_buff *skb)
  11194. +{
  11195. + struct mptcp_option *mp_opt = (struct mptcp_option *)ptr;
  11196. +
  11197. + /* If the socket is mp-capable we would have a mopt. */
  11198. + if (!mopt)
  11199. + return;
  11200. +
  11201. + switch (mp_opt->sub) {
  11202. + case MPTCP_SUB_CAPABLE:
  11203. + {
  11204. + struct mp_capable *mpcapable = (struct mp_capable *)ptr;
  11205. +
  11206. + if (opsize != MPTCP_SUB_LEN_CAPABLE_SYN &&
  11207. + opsize != MPTCP_SUB_LEN_CAPABLE_ACK) {
  11208. + mptcp_debug("%s: mp_capable: bad option size %d\n",
  11209. + __func__, opsize);
  11210. + break;
  11211. + }
  11212. +
  11213. + if (!sysctl_mptcp_enabled)
  11214. + break;
  11215. +
  11216. + /* We only support MPTCP version 0 */
  11217. + if (mpcapable->ver != 0)
  11218. + break;
  11219. +
  11220. + /* MPTCP-RFC 6824:
  11221. + * "If receiving a message with the 'B' flag set to 1, and this
  11222. + * is not understood, then this SYN MUST be silently ignored;
  11223. + */
  11224. + if (mpcapable->b) {
  11225. + mopt->drop_me = 1;
  11226. + break;
  11227. + }
  11228. +
  11229. + /* MPTCP-RFC 6824:
  11230. + * "An implementation that only supports this method MUST set
  11231. + * bit "H" to 1, and bits "C" through "G" to 0."
  11232. + */
  11233. + if (!mpcapable->h)
  11234. + break;
  11235. +
  11236. + mopt->saw_mpc = 1;
  11237. + mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a;
  11238. +
  11239. + if (opsize >= MPTCP_SUB_LEN_CAPABLE_SYN)
  11240. + mopt->mptcp_key = mpcapable->sender_key;
  11241. +
  11242. + break;
  11243. + }
  11244. + case MPTCP_SUB_JOIN:
  11245. + {
  11246. + struct mp_join *mpjoin = (struct mp_join *)ptr;
  11247. +
  11248. + if (opsize != MPTCP_SUB_LEN_JOIN_SYN &&
  11249. + opsize != MPTCP_SUB_LEN_JOIN_SYNACK &&
  11250. + opsize != MPTCP_SUB_LEN_JOIN_ACK) {
  11251. + mptcp_debug("%s: mp_join: bad option size %d\n",
  11252. + __func__, opsize);
  11253. + break;
  11254. + }
  11255. +
  11256. + /* saw_mpc must be set, because in tcp_check_req we assume that
  11257. + * it is set to support falling back to reg. TCP if a rexmitted
  11258. + * SYN has no MP_CAPABLE or MP_JOIN
  11259. + */
  11260. + switch (opsize) {
  11261. + case MPTCP_SUB_LEN_JOIN_SYN:
  11262. + mopt->is_mp_join = 1;
  11263. + mopt->saw_mpc = 1;
  11264. + mopt->low_prio = mpjoin->b;
  11265. + mopt->rem_id = mpjoin->addr_id;
  11266. + mopt->mptcp_rem_token = mpjoin->u.syn.token;
  11267. + mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce;
  11268. + break;
  11269. + case MPTCP_SUB_LEN_JOIN_SYNACK:
  11270. + mopt->saw_mpc = 1;
  11271. + mopt->low_prio = mpjoin->b;
  11272. + mopt->rem_id = mpjoin->addr_id;
  11273. + mopt->mptcp_recv_tmac = mpjoin->u.synack.mac;
  11274. + mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce;
  11275. + break;
  11276. + case MPTCP_SUB_LEN_JOIN_ACK:
  11277. + mopt->saw_mpc = 1;
  11278. + mopt->join_ack = 1;
  11279. + memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20);
  11280. + break;
  11281. + }
  11282. + break;
  11283. + }
  11284. + case MPTCP_SUB_DSS:
  11285. + {
  11286. + struct mp_dss *mdss = (struct mp_dss *)ptr;
  11287. + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
  11288. +
  11289. + /* We check opsize for the csum and non-csum case. We do this,
  11290. + * because the draft says that the csum SHOULD be ignored if
  11291. + * it has not been negotiated in the MP_CAPABLE but still is
  11292. + * present in the data.
  11293. + *
  11294. + * It will get ignored later in mptcp_queue_skb.
  11295. + */
  11296. + if (opsize != mptcp_sub_len_dss(mdss, 0) &&
  11297. + opsize != mptcp_sub_len_dss(mdss, 1)) {
  11298. + mptcp_debug("%s: mp_dss: bad option size %d\n",
  11299. + __func__, opsize);
  11300. + break;
  11301. + }
  11302. +
  11303. + ptr += 4;
  11304. +
  11305. + if (mdss->A) {
  11306. + tcb->mptcp_flags |= MPTCPHDR_ACK;
  11307. +
  11308. + if (mdss->a) {
  11309. + mopt->data_ack = (u32) get_unaligned_be64(ptr);
  11310. + ptr += MPTCP_SUB_LEN_ACK_64;
  11311. + } else {
  11312. + mopt->data_ack = get_unaligned_be32(ptr);
  11313. + ptr += MPTCP_SUB_LEN_ACK;
  11314. + }
  11315. + }
  11316. +
  11317. + tcb->dss_off = (ptr - skb_transport_header(skb));
  11318. +
  11319. + if (mdss->M) {
  11320. + if (mdss->m) {
  11321. + u64 data_seq64 = get_unaligned_be64(ptr);
  11322. +
  11323. + tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET;
  11324. + mopt->data_seq = (u32) data_seq64;
  11325. +
  11326. + ptr += 12; /* 64-bit dseq + subseq */
  11327. + } else {
  11328. + mopt->data_seq = get_unaligned_be32(ptr);
  11329. + ptr += 8; /* 32-bit dseq + subseq */
  11330. + }
  11331. + mopt->data_len = get_unaligned_be16(ptr);
  11332. +
  11333. + tcb->mptcp_flags |= MPTCPHDR_SEQ;
  11334. +
  11335. + /* Is a check-sum present? */
  11336. + if (opsize == mptcp_sub_len_dss(mdss, 1))
  11337. + tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM;
  11338. +
  11339. + /* DATA_FIN only possible with DSS-mapping */
  11340. + if (mdss->F)
  11341. + tcb->mptcp_flags |= MPTCPHDR_FIN;
  11342. + }
  11343. +
  11344. + break;
  11345. + }
  11346. + case MPTCP_SUB_ADD_ADDR:
  11347. + {
  11348. +#if IS_ENABLED(CONFIG_IPV6)
  11349. + struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
  11350. +
  11351. + if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
  11352. + opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
  11353. + (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
  11354. + opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2)) {
  11355. +#else
  11356. + if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
  11357. + opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) {
  11358. +#endif /* CONFIG_IPV6 */
  11359. + mptcp_debug("%s: mp_add_addr: bad option size %d\n",
  11360. + __func__, opsize);
  11361. + break;
  11362. + }
  11363. +
  11364. + /* We have to manually parse the options if we got two of them. */
  11365. + if (mopt->saw_add_addr) {
  11366. + mopt->more_add_addr = 1;
  11367. + break;
  11368. + }
  11369. + mopt->saw_add_addr = 1;
  11370. + mopt->add_addr_ptr = ptr;
  11371. + break;
  11372. + }
  11373. + case MPTCP_SUB_REMOVE_ADDR:
  11374. + if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) {
  11375. + mptcp_debug("%s: mp_remove_addr: bad option size %d\n",
  11376. + __func__, opsize);
  11377. + break;
  11378. + }
  11379. +
  11380. + if (mopt->saw_rem_addr) {
  11381. + mopt->more_rem_addr = 1;
  11382. + break;
  11383. + }
  11384. + mopt->saw_rem_addr = 1;
  11385. + mopt->rem_addr_ptr = ptr;
  11386. + break;
  11387. + case MPTCP_SUB_PRIO:
  11388. + {
  11389. + struct mp_prio *mpprio = (struct mp_prio *)ptr;
  11390. +
  11391. + if (opsize != MPTCP_SUB_LEN_PRIO &&
  11392. + opsize != MPTCP_SUB_LEN_PRIO_ADDR) {
  11393. + mptcp_debug("%s: mp_prio: bad option size %d\n",
  11394. + __func__, opsize);
  11395. + break;
  11396. + }
  11397. +
  11398. + mopt->saw_low_prio = 1;
  11399. + mopt->low_prio = mpprio->b;
  11400. +
  11401. + if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) {
  11402. + mopt->saw_low_prio = 2;
  11403. + mopt->prio_addr_id = mpprio->addr_id;
  11404. + }
  11405. + break;
  11406. + }
  11407. + case MPTCP_SUB_FAIL:
  11408. + if (opsize != MPTCP_SUB_LEN_FAIL) {
  11409. + mptcp_debug("%s: mp_fail: bad option size %d\n",
  11410. + __func__, opsize);
  11411. + break;
  11412. + }
  11413. + mopt->mp_fail = 1;
  11414. + break;
  11415. + case MPTCP_SUB_FCLOSE:
  11416. + if (opsize != MPTCP_SUB_LEN_FCLOSE) {
  11417. + mptcp_debug("%s: mp_fclose: bad option size %d\n",
  11418. + __func__, opsize);
  11419. + break;
  11420. + }
  11421. +
  11422. + mopt->mp_fclose = 1;
  11423. + mopt->mptcp_key = ((struct mp_fclose *)ptr)->key;
  11424. +
  11425. + break;
  11426. + default:
  11427. + mptcp_debug("%s: Received unkown subtype: %d\n",
  11428. + __func__, mp_opt->sub);
  11429. + break;
  11430. + }
  11431. +}
  11432. +
  11433. +int mptcp_check_rtt(const struct tcp_sock *tp, int time)
  11434. +{
  11435. + struct mptcp_cb *mpcb = tp->mpcb;
  11436. + struct sock *sk;
  11437. + u32 rtt_max = 0;
  11438. +
  11439. + /* In MPTCP, we take the max delay across all flows,
  11440. + * in order to take into account meta-reordering buffers.
  11441. + */
  11442. + mptcp_for_each_sk(mpcb, sk) {
  11443. + if (!mptcp_sk_can_recv(sk))
  11444. + continue;
  11445. +
  11446. + if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt)
  11447. + rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt;
  11448. + }
  11449. + if (time < (rtt_max >> 3) || !rtt_max)
  11450. + return 1;
  11451. +
  11452. + return 0;
  11453. +}
  11454. +
  11455. +static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk)
  11456. +{
  11457. + struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
  11458. +
  11459. + if (mpadd->ipver == 4) {
  11460. + __be16 port = 0;
  11461. + if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2)
  11462. + port = mpadd->u.v4.port;
  11463. +
  11464. + mptcp_v4_add_raddress(tcp_sk(sk)->mpcb, &mpadd->u.v4.addr, port,
  11465. + mpadd->addr_id);
  11466. +#if IS_ENABLED(CONFIG_IPV6)
  11467. + } else if (mpadd->ipver == 6) {
  11468. + __be16 port = 0;
  11469. + if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2)
  11470. + port = mpadd->u.v6.port;
  11471. +
  11472. + mptcp_v6_add_raddress(tcp_sk(sk)->mpcb, &mpadd->u.v6.addr, port,
  11473. + mpadd->addr_id);
  11474. +#endif /* CONFIG_IPV6 */
  11475. + }
  11476. +}
  11477. +
  11478. +static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk)
  11479. +{
  11480. + struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
  11481. + int i;
  11482. + u8 rem_id;
  11483. +
  11484. + for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) {
  11485. + rem_id = (&mprem->addrs_id)[i];
  11486. + if (!mptcp_rem_raddress(tcp_sk(sk)->mpcb, rem_id))
  11487. + mptcp_send_reset_rem_id(tcp_sk(sk)->mpcb, rem_id);
  11488. + }
  11489. +}
  11490. +
  11491. +static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk)
  11492. +{
  11493. + struct tcphdr *th = tcp_hdr(skb);
  11494. + unsigned char *ptr;
  11495. + int length = (th->doff * 4) - sizeof(struct tcphdr);
  11496. +
  11497. + /* Jump through the options to check whether ADD_ADDR is there */
  11498. + ptr = (unsigned char *)(th + 1);
  11499. + while (length > 0) {
  11500. + int opcode = *ptr++;
  11501. + int opsize;
  11502. +
  11503. + switch (opcode) {
  11504. + case TCPOPT_EOL:
  11505. + return;
  11506. + case TCPOPT_NOP:
  11507. + length--;
  11508. + continue;
  11509. + default:
  11510. + opsize = *ptr++;
  11511. + if (opsize < 2)
  11512. + return;
  11513. + if (opsize > length)
  11514. + return; /* don't parse partial options */
  11515. + if (opcode == TCPOPT_MPTCP &&
  11516. + ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_ADD_ADDR) {
  11517. +#if IS_ENABLED(CONFIG_IPV6)
  11518. + struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
  11519. + if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
  11520. + opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
  11521. + (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
  11522. + opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2))
  11523. +#else
  11524. + if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
  11525. + opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2)
  11526. +#endif /* CONFIG_IPV6 */
  11527. + goto cont;
  11528. +
  11529. + mptcp_handle_add_addr(ptr, sk);
  11530. + }
  11531. + if (opcode == TCPOPT_MPTCP &&
  11532. + ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) {
  11533. + if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0)
  11534. + goto cont;
  11535. +
  11536. + mptcp_handle_rem_addr(ptr, sk);
  11537. + }
  11538. +cont:
  11539. + ptr += opsize - 2;
  11540. + length -= opsize;
  11541. + }
  11542. + }
  11543. + return;
  11544. +}
  11545. +
  11546. +static inline int mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th)
  11547. +{
  11548. + struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp;
  11549. + struct sock *meta_sk = mptcp_meta_sk(sk);
  11550. + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
  11551. +
  11552. + if (unlikely(mptcp->rx_opt.mp_fail)) { /* this branch always returns 0 */
  11553. + mptcp->rx_opt.mp_fail = 0;
  11554. +
  11555. + if (!th->rst && !mpcb->infinite_mapping_snd) {
  11556. + struct sock *sk_it, *tmpsk;
  11557. +
  11558. + mpcb->send_infinite_mapping = 1;
  11559. + /* We resend everything that has not been acknowledged */
  11560. + meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
  11561. +
  11562. + /* We artificially restart the whole send-queue. Thus,
  11563. + * it is as if no packets are in flight
  11564. + */
  11565. + tcp_sk(meta_sk)->packets_out = 0;
  11566. +
  11567. + /* If the snd_nxt already wrapped around, we have to
  11568. + * undo the wrapping, as we are restarting from snd_una
  11569. + * on.
  11570. + */
  11571. + if (tcp_sk(meta_sk)->snd_nxt < tcp_sk(meta_sk)->snd_una) {
  11572. + mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2;
  11573. + mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
  11574. + }
  11575. + tcp_sk(meta_sk)->snd_nxt = tcp_sk(meta_sk)->snd_una;
  11576. +
  11577. + /* Trigger a sending on the meta. */
  11578. + mptcp_push_pending_frames(meta_sk);
  11579. + /* Close every other subflow; _safe iteration, as in the fast-close path below */
  11580. + mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
  11581. + if (sk != sk_it)
  11582. + mptcp_sub_force_close(sk_it);
  11583. + }
  11584. + }
  11585. +
  11586. + return 0;
  11587. + }
  11588. +
  11589. + if (unlikely(mptcp->rx_opt.mp_fclose)) { /* returns 1 only if the received key matches */
  11590. + struct sock *sk_it, *tmpsk;
  11591. +
  11592. + mptcp->rx_opt.mp_fclose = 0;
  11593. + if (mptcp->rx_opt.mptcp_key != mpcb->mptcp_loc_key)
  11594. + return 0;
  11595. +
  11596. + if (tcp_need_reset(sk->sk_state))
  11597. + tcp_send_active_reset(sk, GFP_ATOMIC);
  11598. +
  11599. + mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk)
  11600. + mptcp_sub_force_close(sk_it);
  11601. +
  11602. + tcp_reset(meta_sk);
  11603. +
  11604. + return 1;
  11605. + }
  11606. +
  11607. + return 0;
  11608. +}
  11609. +
  11610. +static inline void mptcp_path_array_check(struct sock *meta_sk)
  11611. +{
  11612. + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
  11613. +
  11614. + if (unlikely(mpcb->list_rcvd)) { /* flag is raised when a remote address is stored or updated */
  11615. + mpcb->list_rcvd = 0; /* consume the flag before notifying */
  11616. + if (mpcb->pm_ops->new_remote_address) /* the path-manager hook is optional */
  11617. + mpcb->pm_ops->new_remote_address(meta_sk);
  11618. + }
  11619. +}
  11620. +
  11621. +int mptcp_handle_options(struct sock *sk, const struct tcphdr *th, struct sk_buff *skb)
  11622. +{
  11623. + struct tcp_sock *tp = tcp_sk(sk);
  11624. + struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
  11625. +
  11626. + if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd) /* infinite mapping active: skip all option handling */
  11627. + return 0;
  11628. +
  11629. + if (mptcp_mp_fail_rcvd(sk, th)) /* non-zero only after a fast close was carried out */
  11630. + return 1;
  11631. +
  11632. + /* RFC 6824, Section 3.3:
  11633. + * If a checksum is not present when its use has been negotiated, the
  11634. + * receiver MUST close the subflow with a RST as it is considered broken.
  11635. + */
  11636. + if (mptcp_is_data_seq(skb) && tp->mpcb->dss_csum &&
  11637. + !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) {
  11638. + if (tcp_need_reset(sk->sk_state))
  11639. + tcp_send_active_reset(sk, GFP_ATOMIC);
  11640. +
  11641. + mptcp_sub_force_close(sk);
  11642. + return 1;
  11643. + }
  11644. +
  11645. + /* We have to acknowledge retransmissions of the third
  11646. + * ack.
  11647. + */
  11648. + if (mopt->join_ack) {
  11649. + tcp_send_delayed_ack(sk);
  11650. + mopt->join_ack = 0;
  11651. + }
  11652. +
  11653. + if (mopt->saw_add_addr || mopt->saw_rem_addr) {
  11654. + if (mopt->more_add_addr || mopt->more_rem_addr) { /* re-walk the TCP options of the skb */
  11655. + mptcp_parse_addropt(skb, sk);
  11656. + } else { /* otherwise use the option pointers saved in rx_opt */
  11657. + if (mopt->saw_add_addr)
  11658. + mptcp_handle_add_addr(mopt->add_addr_ptr, sk);
  11659. + if (mopt->saw_rem_addr)
  11660. + mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk);
  11661. + }
  11662. +
  11663. + mopt->more_add_addr = 0;
  11664. + mopt->saw_add_addr = 0;
  11665. + mopt->more_rem_addr = 0;
  11666. + mopt->saw_rem_addr = 0;
  11667. + }
  11668. + if (mopt->saw_low_prio) {
  11669. + if (mopt->saw_low_prio == 1) { /* value 1: priority applies to this subflow only */
  11670. + tp->mptcp->rcv_low_prio = mopt->low_prio;
  11671. + } else { /* any other value: apply to every subflow whose rem_id equals prio_addr_id */
  11672. + struct sock *sk_it;
  11673. + mptcp_for_each_sk(tp->mpcb, sk_it) {
  11674. + struct mptcp_tcp_sock *mptcp = tcp_sk(sk_it)->mptcp;
  11675. + if (mptcp->rem_id == mopt->prio_addr_id)
  11676. + mptcp->rcv_low_prio = mopt->low_prio;
  11677. + }
  11678. + }
  11679. + mopt->saw_low_prio = 0;
  11680. + }
  11681. +
  11682. + mptcp_data_ack(sk, skb);
  11683. +
  11684. + mptcp_path_array_check(mptcp_meta_sk(sk));
  11685. + /* Socket may have been mp_killed by a REMOVE_ADDR */
  11686. + if (tp->mp_killed)
  11687. + return 1;
  11688. +
  11689. + return 0;
  11690. +}
  11691. +
  11692. +/* The skptr is needed, because if we become MPTCP-capable, we have to switch
  11693. + * from meta-socket to master-socket.
  11694. + *
  11695. + * @return: 1 - we want to reset this connection
  11696. + * 2 - we want to discard the received syn/ack
  11697. + * 0 - everything is fine - continue
  11698. + */
  11699. +int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
  11700. + struct sk_buff *skb,
  11701. + struct mptcp_options_received *mopt)
  11702. +{
  11703. + struct tcp_sock *tp = tcp_sk(sk);
  11704. +
  11705. + if (tp->mpc) {
  11706. + u8 hash_mac_check[20];
  11707. + struct mptcp_cb *mpcb = tp->mpcb;
  11708. +
  11709. + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
  11710. + (u8 *)&mpcb->mptcp_loc_key,
  11711. + (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
  11712. + (u8 *)&tp->mptcp->mptcp_loc_nonce,
  11713. + (u32 *)hash_mac_check);
  11714. + if (memcmp(hash_mac_check,
  11715. + (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) {
  11716. + mptcp_sub_force_close(sk); /* first 8 HMAC bytes differ from the received tmac */
  11717. + return 1;
  11718. + }
  11719. +
  11720. + /* Set this flag in order to postpone data sending
  11721. + * until the 4th ack arrives.
  11722. + */
  11723. + tp->mptcp->pre_established = 1;
  11724. + tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio;
  11725. +
  11726. + mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
  11727. + (u8 *)&mpcb->mptcp_rem_key,
  11728. + (u8 *)&tp->mptcp->mptcp_loc_nonce,
  11729. + (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
  11730. + (u32 *)&tp->mptcp->sender_mac[0]); /* same inputs with local/remote swapped, kept in sender_mac */
  11731. +
  11732. + } else if (mopt->saw_mpc) {
  11733. + if (mptcp_create_master_sk(sk, mopt->mptcp_key,
  11734. + ntohs(tcp_hdr(skb)->window)))
  11735. + return 2; /* mptcp_create_master_sk() reported non-zero */
  11736. +
  11737. + sk = tcp_sk(sk)->mpcb->master_sk; /* from here on sk/tp refer to the master-socket */
  11738. + *skptr = sk;
  11739. + tp = tcp_sk(sk);
  11740. +
  11741. + /* snd_nxt - 1, because it has been incremented
  11742. + * by tcp_connect for the SYN
  11743. + */
  11744. + tp->mptcp->snt_isn = tp->snd_nxt - 1;
  11745. + tp->mpcb->dss_csum = mopt->dss_csum;
  11746. + tp->mptcp->include_mpc = 1;
  11747. +
  11748. + sk_set_socket(sk, mptcp_meta_sk(sk)->sk_socket);
  11749. + sk->sk_wq = mptcp_meta_sk(sk)->sk_wq;
  11750. +
  11751. + mptcp_update_metasocket(sk, mptcp_meta_sk(sk));
  11752. +
  11753. + /* hold in mptcp_inherit_sk due to initialization to 2 */
  11754. + sock_put(sk);
  11755. + } else { /* neither mpc nor saw_mpc: stop requesting MPTCP on this socket */
  11756. + tp->request_mptcp = 0;
  11757. +
  11758. + if (tp->inside_tk_table)
  11759. + mptcp_hash_remove(tp);
  11760. + }
  11761. +
  11762. + if (tp->mpc)
  11763. + tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq;
  11764. +
  11765. + return 0;
  11766. +}
  11767. +
  11768. +bool mptcp_should_expand_sndbuf(const struct sock *sk)
  11769. +{
  11770. + struct sock *sk_it;
  11771. + struct sock *meta_sk = mptcp_meta_sk(sk);
  11772. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  11773. + int cnt_backups = 0;
  11774. + int backup_available = 0;
  11775. +
  11776. + /* We circumvent this check in tcp_check_space, because we want to
  11777. + * always call sk_write_space. So, we reproduce the check here.
  11778. + */
  11779. + if (!meta_sk->sk_socket ||
  11780. + !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
  11781. + return false;
  11782. +
  11783. + /* If the user specified a specific send buffer setting, do
  11784. + * not modify it.
  11785. + */
  11786. + if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
  11787. + return false;
  11788. +
  11789. + /* If we are under global TCP memory pressure, do not expand. */
  11790. + if (sk_under_memory_pressure(meta_sk))
  11791. + return false;
  11792. +
  11793. + /* If we are under soft global TCP memory pressure, do not expand. */
  11794. + if (sk_memory_allocated(meta_sk) >= sk_prot_mem_limits(meta_sk, 0))
  11795. + return false;
  11796. +
  11797. +
  11798. + /* For MPTCP we look for a subsocket that could send data.
  11799. + * If we found one, then we update the send-buffer.
  11800. + */
  11801. + mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
  11802. + struct tcp_sock *tp_it = tcp_sk(sk_it);
  11803. +
  11804. + if (!mptcp_sk_can_send(sk_it))
  11805. + continue;
  11806. +
  11807. + /* Backup-flows have to be counted - if there is no other
  11808. + * subflow we take the backup-flow into account. */
  11809. + if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) {
  11810. + cnt_backups++;
  11811. + }
  11812. +
  11813. + if (tp_it->packets_out < tp_it->snd_cwnd) { /* subflow still has congestion-window room */
  11814. + if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) {
  11815. + backup_available = 1; /* remember it, but keep looking for a non-backup subflow */
  11816. + continue;
  11817. + }
  11818. + return true; /* a non-backup subflow can send */
  11819. + }
  11820. + }
  11821. +
  11822. + /* Backup-flow is available for sending - update send-buffer */
  11823. + if (meta_tp->mpcb->cnt_established == cnt_backups && backup_available)
  11824. + return true;
  11825. + return false;
  11826. +}
  11827. +
  11828. +void mptcp_init_buffer_space(struct sock *sk)
  11829. +{
  11830. + struct tcp_sock *tp = tcp_sk(sk);
  11831. + struct sock *meta_sk = mptcp_meta_sk(sk);
  11832. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  11833. + int space;
  11834. +
  11835. + tcp_init_buffer_space(sk);
  11836. +
  11837. + if (is_master_tp(tp)) {
  11838. + /* If there is only one subflow, we just use regular TCP
  11839. + * autotuning. User-locks are handled already by
  11840. + * tcp_init_buffer_space
  11841. + */
  11842. + meta_tp->window_clamp = tp->window_clamp;
  11843. + meta_tp->rcv_ssthresh = tp->rcv_ssthresh;
  11844. + meta_sk->sk_rcvbuf = sk->sk_rcvbuf;
  11845. + meta_sk->sk_sndbuf = sk->sk_sndbuf;
  11846. +
  11847. + return;
  11848. + }
  11849. +
  11850. + if (meta_sk->sk_userlocks & SOCK_RCVBUF_LOCK) /* receive buffer is user-locked: leave it alone */
  11851. + goto snd_buf;
  11852. +
  11853. + /* Adding a new subflow to the rcv-buffer space. We make a simple
  11854. + * addition, to give some space to allow traffic on the new subflow.
  11855. + * Autotuning will increase it further later on.
  11856. + */
  11857. + space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf, sysctl_tcp_rmem[2]); /* sum, capped at sysctl_tcp_rmem[2] */
  11858. + if (space > meta_sk->sk_rcvbuf) {
  11859. + meta_tp->window_clamp += tp->window_clamp;
  11860. + meta_tp->rcv_ssthresh += tp->rcv_ssthresh;
  11861. + meta_sk->sk_rcvbuf = space;
  11862. + }
  11863. +
  11864. +snd_buf:
  11865. + if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
  11866. + return;
  11867. +
  11868. + /* Adding a new subflow to the send-buffer space. We make a simple
  11869. + * addition, to give some space to allow traffic on the new subflow.
  11870. + * Autotuning will increase it further later on.
  11871. + */
  11872. + space = min(meta_sk->sk_sndbuf + sk->sk_sndbuf, sysctl_tcp_wmem[2]); /* sum, capped at sysctl_tcp_wmem[2] */
  11873. + if (space > meta_sk->sk_sndbuf) {
  11874. + meta_sk->sk_sndbuf = space;
  11875. + meta_sk->sk_write_space(meta_sk); /* buffer grew: invoke the meta's write-space callback */
  11876. + }
  11877. +}
  11878. +
  11879. +void mptcp_tcp_set_rto(struct sock *sk)
  11880. +{
  11881. + tcp_set_rto(sk);
  11882. + mptcp_set_rto(sk); /* always runs after tcp_set_rto() on the same socket */
  11883. +}
  11884. diff --git a/net/mptcp/mptcp_ipv4.c b/net/mptcp/mptcp_ipv4.c
  11885. new file mode 100644
  11886. index 0000000..b6053f1
  11887. --- /dev/null
  11888. +++ b/net/mptcp/mptcp_ipv4.c
  11889. @@ -0,0 +1,603 @@
  11890. +/*
  11891. + * MPTCP implementation - IPv4-specific functions
  11892. + *
  11893. + * Initial Design & Implementation:
  11894. + * Sébastien Barré <sebastien.barre@uclouvain.be>
  11895. + *
  11896. + * Current Maintainer:
  11897. + * Christoph Paasch <christoph.paasch@uclouvain.be>
  11898. + *
  11899. + * Additional authors:
  11900. + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
  11901. + * Gregory Detal <gregory.detal@uclouvain.be>
  11902. + * Fabien Duchêne <fabien.duchene@uclouvain.be>
  11903. + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
  11904. + * Lavkesh Lahngir <lavkesh51@gmail.com>
  11905. + * Andreas Ripke <ripke@neclab.eu>
  11906. + * Vlad Dogaru <vlad.dogaru@intel.com>
  11907. + * Octavian Purdila <octavian.purdila@intel.com>
  11908. + * John Ronan <jronan@tssg.org>
  11909. + * Catalin Nicutar <catalin.nicutar@gmail.com>
  11910. + * Brandon Heller <brandonh@stanford.edu>
  11911. + *
  11912. + *
  11913. + * This program is free software; you can redistribute it and/or
  11914. + * modify it under the terms of the GNU General Public License
  11915. + * as published by the Free Software Foundation; either version
  11916. + * 2 of the License, or (at your option) any later version.
  11917. + */
  11918. +
  11919. +#include <linux/export.h>
  11920. +#include <linux/ip.h>
  11921. +#include <linux/list.h>
  11922. +#include <linux/skbuff.h>
  11923. +#include <linux/spinlock.h>
  11924. +#include <linux/tcp.h>
  11925. +
  11926. +#include <net/inet_common.h>
  11927. +#include <net/inet_connection_sock.h>
  11928. +#include <net/mptcp.h>
  11929. +#include <net/mptcp_v4.h>
  11930. +#include <net/request_sock.h>
  11931. +#include <net/tcp.h>
  11932. +
  11933. +u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
  11934. + u32 seq)
  11935. +{
  11936. + u32 hash[MD5_DIGEST_WORDS];
  11937. +
  11938. + hash[0] = (__force u32)saddr;
  11939. + hash[1] = (__force u32)daddr;
  11940. + hash[2] = ((u32)(__force u16)sport << 16) + (__force u16)dport; /* widen first: no signed int shift */
  11941. + hash[3] = seq;
  11942. +
  11943. + md5_transform(hash, mptcp_secret); /* mixes the 4-tuple and seq with mptcp_secret */
  11944. +
  11945. + return hash[0]; /* the nonce is the first word of the transformed state */
  11946. +}
  11947. +
  11948. +u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
  11949. +{
  11950. + u32 hash[MD5_DIGEST_WORDS];
  11951. +
  11952. + hash[0] = (__force u32)saddr;
  11953. + hash[1] = (__force u32)daddr;
  11954. + hash[2] = ((u32)(__force u16)sport << 16) + (__force u16)dport; /* widen first: no signed int shift */
  11955. + hash[3] = mptcp_key_seed++; /* the seed advances on every call */
  11956. +
  11957. + md5_transform(hash, mptcp_secret);
  11958. +
  11959. + return *((u64 *)hash); /* the key is the first two words of the transformed state */
  11960. +}
  11961. +
  11962. +
  11963. +static void mptcp_v4_reqsk_destructor(struct request_sock *req)
  11964. +{
  11965. + mptcp_reqsk_destructor(req); /* MPTCP part first ... */
  11966. +
  11967. + tcp_v4_reqsk_destructor(req); /* ... then the plain TCP/IPv4 destructor */
  11968. +}
  11969. +
  11970. +/* Similar to tcp_request_sock_ops, but with the MPTCP request-sock size and destructor */
  11971. +struct request_sock_ops mptcp_request_sock_ops __read_mostly = {
  11972. + .family = PF_INET,
  11973. + .obj_size = sizeof(struct mptcp_request_sock),
  11974. + .rtx_syn_ack = tcp_v4_rtx_synack,
  11975. + .send_ack = tcp_v4_reqsk_send_ack,
  11976. + .destructor = mptcp_v4_reqsk_destructor, /* runs mptcp_reqsk_destructor() before tcp_v4_reqsk_destructor() */
  11977. + .send_reset = tcp_v4_send_reset,
  11978. + .syn_ack_timeout = tcp_syn_ack_timeout,
  11979. +};
  11980. +
  11981. +static void mptcp_v4_reqsk_queue_hash_add(struct sock *meta_sk,
  11982. + struct request_sock *req,
  11983. + unsigned long timeout)
  11984. +{
  11985. + const u32 h1 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
  11986. + inet_rsk(req)->ir_rmt_port,
  11987. + 0, MPTCP_HASH_SIZE);
  11988. + /* We cannot call inet_csk_reqsk_queue_hash_add(), because we do not
  11989. + * want to reset the keepalive-timer (responsible for retransmitting
  11990. + * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot
  11991. + * overload the keepalive timer. Also, it's not a big deal, because the
  11992. + * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So,
  11993. + * if the third ACK gets lost, the client will handle the retransmission
  11994. + * anyways. If our SYN/ACK gets lost, the client will retransmit the
  11995. + * SYN.
  11996. + */
  11997. + struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
  11998. + struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt;
  11999. + const u32 h2 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
  12000. + inet_rsk(req)->ir_rmt_port,
  12001. + lopt->hash_rnd, lopt->nr_table_entries);
  12002. +
  12003. + reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout);
  12004. + reqsk_queue_added(&meta_icsk->icsk_accept_queue);
  12005. + /* Besides the meta's accept queue (h2), link req into mptcp_reqsk_htb[h1] for tuple lookups */
  12006. + spin_lock(&mptcp_reqsk_hlock);
  12007. + list_add(&mptcp_rsk(req)->collide_tuple, &mptcp_reqsk_htb[h1]);
  12008. + spin_unlock(&mptcp_reqsk_hlock);
  12009. +}
  12010. +
  12011. +/* Similar to tcp_v4_conn_request: answers a join SYN received on the meta-socket with a SYN/ACK */
  12012. +static void mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb)
  12013. +{
  12014. + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
  12015. + struct tcp_options_received tmp_opt;
  12016. + struct mptcp_options_received mopt;
  12017. + struct request_sock *req;
  12018. + struct inet_request_sock *ireq;
  12019. + struct mptcp_request_sock *mtreq;
  12020. + struct dst_entry *dst = NULL;
  12021. + u8 mptcp_hash_mac[20];
  12022. + __be32 saddr = ip_hdr(skb)->saddr;
  12023. + __be32 daddr = ip_hdr(skb)->daddr;
  12024. + __u32 isn = TCP_SKB_CB(skb)->when;
  12025. + int loc_id;
  12026. + union inet_addr addr;
  12027. +
  12028. + tcp_clear_options(&tmp_opt);
  12029. + mptcp_init_mp_opt(&mopt);
  12030. + tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
  12031. + tmp_opt.user_mss = tcp_sk(meta_sk)->rx_opt.user_mss;
  12032. + tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
  12033. +
  12034. + req = inet_reqsk_alloc(&mptcp_request_sock_ops);
  12035. + if (!req)
  12036. + return;
  12037. +
  12038. +#ifdef CONFIG_TCP_MD5SIG
  12039. + tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
  12040. +#endif
  12041. +
  12042. + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
  12043. + tcp_openreq_init(req, &tmp_opt, skb);
  12044. +
  12045. + ireq = inet_rsk(req);
  12046. + ireq->ir_loc_addr = daddr;
  12047. + ireq->ir_rmt_addr = saddr;
  12048. + ireq->no_srccheck = inet_sk(meta_sk)->transparent;
  12049. + ireq->opt = tcp_v4_save_options(skb);
  12050. +
  12051. + if (security_inet_conn_request(meta_sk, skb, req))
  12052. + goto drop_and_free;
  12053. +
  12054. + /* This function never issues syncookies, so the ECN request setup is unconditional. */
  12055. + TCP_ECN_create_request(req, skb, sock_net(meta_sk));
  12056. +
  12057. + if (!isn) {
  12058. + struct flowi4 fl4;
  12059. +
  12060. + /* VJ's idea. We save last timestamp seen
  12061. + * from the destination in peer table, when entering
  12062. + * state TIME-WAIT, and check against it before
  12063. + * accepting new connection request.
  12064. + *
  12065. + * If "isn" is not zero, this request hit alive
  12066. + * timewait bucket, so that all the necessary checks
  12067. + * are made in the function processing timewait state.
  12068. + */
  12069. + if (tmp_opt.saw_tstamp &&
  12070. + tcp_death_row.sysctl_tw_recycle &&
  12071. + (dst = inet_csk_route_req(meta_sk, &fl4, req)) != NULL &&
  12072. + fl4.daddr == saddr) {
  12073. + if (!tcp_peer_is_proven(req, dst, true)) {
  12074. + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_PAWSPASSIVEREJECTED);
  12075. + goto drop_and_release;
  12076. + }
  12077. + }
  12078. + /* Kill the following clause, if you dislike this way. */
  12079. + else if (!sysctl_tcp_syncookies &&
  12080. + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(meta_sk) <
  12081. + (sysctl_max_syn_backlog >> 2)) &&
  12082. + !tcp_peer_is_proven(req, dst, false)) {
  12083. + /* Without syncookies last quarter of
  12084. + * backlog is filled with destinations,
  12085. + * proven to be alive.
  12086. + * It means that we continue to communicate
  12087. + * to destinations, already remembered
  12088. + * to the moment of synflood.
  12089. + */
  12090. + LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
  12091. + &saddr, ntohs(tcp_hdr(skb)->source));
  12092. + goto drop_and_release;
  12093. + }
  12094. +
  12095. + isn = tcp_v4_init_sequence(skb);
  12096. + }
  12097. + tcp_rsk(req)->snt_isn = isn;
  12098. + tcp_rsk(req)->snt_synack = tcp_time_stamp;
  12099. + tcp_rsk(req)->listener = NULL;
  12100. +
  12101. + mtreq = mptcp_rsk(req);
  12102. + mtreq->mpcb = mpcb;
  12103. + INIT_LIST_HEAD(&mtreq->collide_tuple);
  12104. + mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce;
  12105. + mtreq->mptcp_rem_key = mpcb->mptcp_rem_key;
  12106. + mtreq->mptcp_loc_key = mpcb->mptcp_loc_key;
  12107. + mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(saddr, daddr,
  12108. + tcp_hdr(skb)->source,
  12109. + tcp_hdr(skb)->dest, isn);
  12110. + mptcp_hmac_sha1((u8 *)&mtreq->mptcp_loc_key,
  12111. + (u8 *)&mtreq->mptcp_rem_key,
  12112. + (u8 *)&mtreq->mptcp_loc_nonce,
  12113. + (u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac);
  12114. + mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac;
  12115. + addr.ip = ireq->ir_loc_addr;
  12116. + loc_id = mpcb->pm_ops->get_local_id(AF_INET, &addr, sock_net(meta_sk));
  12117. + if (loc_id == -1) /* Address not part of the allowed ones */
  12118. + goto drop_and_release;
  12119. + mtreq->loc_id = loc_id; /* stored only after the -1 test was done on a signed int */
  12120. + mtreq->rem_id = mopt.rem_id;
  12121. + mtreq->low_prio = mopt.low_prio;
  12122. + tcp_rsk(req)->saw_mpc = 1;
  12123. +
  12124. + if (tcp_v4_send_synack(meta_sk, dst, req, skb_get_queue_mapping(skb)))
  12125. + goto drop_and_free;
  12126. +
  12127. + /* Adding to request queue in metasocket */
  12128. + mptcp_v4_reqsk_queue_hash_add(meta_sk, req, TCP_TIMEOUT_INIT);
  12129. +
  12130. + return;
  12131. +
  12132. +drop_and_release:
  12133. + dst_release(dst);
  12134. +drop_and_free:
  12135. + reqsk_free(req);
  12136. + return;
  12137. +}
  12138. +
  12139. +int mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id)
  12140. +{
  12141. + int i;
  12142. +
  12143. + for (i = 0; i < MPTCP_MAX_ADDR; i++) {
  12144. + if (!((1 << i) & mpcb->rem4_bits)) /* slot i is not in use */
  12145. + continue;
  12146. +
  12147. + if (mpcb->remaddr4[i].rem4_id == id) {
  12148. + /* remove address from bitfield */
  12149. + mpcb->rem4_bits &= ~(1 << i);
  12150. +
  12151. + return 0; /* only the first in-use slot with this id is cleared */
  12152. + }
  12153. + }
  12154. +
  12155. + return -1; /* no in-use slot carries this id */
  12156. +}
  12157. +
  12158. +/* Based on function tcp_v4_conn_request (tcp_ipv4.c)
  12159. + * Returns 0 if the address is stored, updated or already known, and -1
  12160. + * if there is no space anymore to store an additional address
  12161. + */
  12162. +int mptcp_v4_add_raddress(struct mptcp_cb *mpcb, const struct in_addr *addr,
  12163. + __be16 port, u8 id)
  12164. +{
  12165. + int i;
  12166. + struct mptcp_rem4 *rem4;
  12167. +
  12168. + mptcp_for_each_bit_set(mpcb->rem4_bits, i) {
  12169. + rem4 = &mpcb->remaddr4[i];
  12170. +
  12171. + /* Address is already in the list --- nothing to do */
  12172. + if (rem4->rem4_id == id &&
  12173. + rem4->addr.s_addr == addr->s_addr && rem4->port == port)
  12174. + return 0;
  12175. +
  12176. + /* This may be the case, when the peer is behind a NAT. He is
  12177. + * trying to JOIN, thus sending the JOIN with a certain ID.
  12178. + * However the src_addr of the IP-packet has been changed. We
  12179. + * update the addr in the list, because this is the address as
  12180. + * OUR BOX sees it.
  12181. + */
  12182. + if (rem4->rem4_id == id && rem4->addr.s_addr != addr->s_addr) {
  12183. + /* update the address */
  12184. + mptcp_debug("%s: updating old addr:%pI4 to addr %pI4 with id:%d\n",
  12185. + __func__, &rem4->addr.s_addr,
  12186. + &addr->s_addr, id);
  12187. + rem4->addr.s_addr = addr->s_addr;
  12188. + rem4->port = port;
  12189. + mpcb->list_rcvd = 1;
  12190. + return 0;
  12191. + }
  12192. + }
  12193. +
  12194. + i = mptcp_find_free_index(mpcb->rem4_bits);
  12195. + /* Do we have already the maximum number of local/remote addresses? */
  12196. + if (i < 0) {
  12197. + mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI4\n",
  12198. + __func__, MPTCP_MAX_ADDR, &addr->s_addr);
  12199. + return -1;
  12200. + }
  12201. +
  12202. + rem4 = &mpcb->remaddr4[i];
  12203. +
  12204. + /* Address is not known yet, store it */
  12205. + rem4->addr.s_addr = addr->s_addr;
  12206. + rem4->port = port;
  12207. + rem4->bitfield = 0;
  12208. + rem4->retry_bitfield = 0;
  12209. + rem4->rem4_id = id;
  12210. + mpcb->list_rcvd = 1; /* flag the change; mptcp_path_array_check() consumes it */
  12211. + mpcb->rem4_bits |= (1 << i); /* mark slot i as in use */
  12212. +
  12213. + return 0;
  12214. +}
  12215. +
  12216. +/* Sets bit @index in the bitfield of the stored remote address equal to @daddr;
  12217. + * local address is not set as it will disappear with the global address-list
  12218. + */
  12219. +void mptcp_v4_set_init_addr_bit(struct mptcp_cb *mpcb, __be32 daddr, int index)
  12220. +{
  12221. + int i;
  12222. +
  12223. + mptcp_for_each_bit_set(mpcb->rem4_bits, i) {
  12224. + if (mpcb->remaddr4[i].addr.s_addr == daddr) {
  12225. + mpcb->remaddr4[i].bitfield |= (1 << index);
  12226. + return; /* only the first matching entry is marked */
  12227. + }
  12228. + }
  12229. +}
  12230. +
  12231. +/* Join requests (the SYN or the final ACK) are handled here; other skbs go to the matching subflow */
  12232. +int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
  12233. +{
  12234. + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
  12235. + struct sock *child, *rsk = NULL;
  12236. + int ret;
  12237. +
  12238. + if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
  12239. + struct tcphdr *th = tcp_hdr(skb);
  12240. + const struct iphdr *iph = ip_hdr(skb);
  12241. + struct sock *sk;
  12242. +
  12243. + sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo,
  12244. + iph->saddr, th->source, iph->daddr,
  12245. + th->dest, inet_iif(skb));
  12246. +
  12247. + if (!sk) {
  12248. + kfree_skb(skb);
  12249. + return 0;
  12250. + }
  12251. + if (sk->sk_state == TCP_TIME_WAIT) { /* checked first: a timewait sock is not a full sock */
  12252. + inet_twsk_put(inet_twsk(sk));
  12253. + kfree_skb(skb);
  12254. + return 0;
  12255. + }
  12256. +
  12257. + if (is_meta_sk(sk)) {
  12258. + WARN(1, "%s Did not find a sub-sk - did found the meta!\n", __func__);
  12259. + kfree_skb(skb);
  12260. + sock_put(sk);
  12261. + return 0;
  12262. + }
  12263. +
  12264. + ret = tcp_v4_do_rcv(sk, skb);
  12265. + sock_put(sk);
  12266. +
  12267. + return ret;
  12268. + }
  12269. + TCP_SKB_CB(skb)->mptcp_flags = 0;
  12270. +
  12271. + /* Has been removed from the tk-table. Thus, no new subflows.
  12272. + *
  12273. + * Check for close-state is necessary, because we may have been closed
  12274. + * without passing by mptcp_close().
  12275. + *
  12276. + * When falling back, no new subflows are allowed either.
  12277. + */
  12278. + if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
  12279. + mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
  12280. + goto reset_and_discard;
  12281. +
  12282. + child = tcp_v4_hnd_req(meta_sk, skb);
  12283. +
  12284. + if (!child)
  12285. + goto discard;
  12286. +
  12287. + if (child != meta_sk) {
  12288. + sock_rps_save_rxhash(child, skb);
  12289. + /* We don't call tcp_child_process here, because we hold
  12290. + * already the meta-sk-lock and are sure that it is not owned
  12291. + * by the user.
  12292. + */
  12293. + ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
  12294. + bh_unlock_sock(child);
  12295. + sock_put(child);
  12296. + if (ret) {
  12297. + rsk = child;
  12298. + goto reset_and_discard;
  12299. + }
  12300. + } else {
  12301. + if (tcp_hdr(skb)->syn) {
  12302. + struct mp_join *join_opt = mptcp_find_join(skb);
  12303. + /* Currently we make two calls to mptcp_find_join(). This
  12304. + * can probably be optimized.
  12305. + */
  12306. + if (mptcp_v4_add_raddress(mpcb,
  12307. + (struct in_addr *)&ip_hdr(skb)->saddr,
  12308. + 0,
  12309. + join_opt->addr_id) < 0)
  12310. + goto reset_and_discard;
  12311. + mpcb->list_rcvd = 0;
  12312. +
  12313. + mptcp_v4_join_request(meta_sk, skb);
  12314. + goto discard;
  12315. + }
  12316. + goto reset_and_discard;
  12317. + }
  12318. + return 0;
  12319. +
  12320. +reset_and_discard:
  12321. + tcp_v4_send_reset(rsk, skb);
  12322. +discard:
  12323. + kfree_skb(skb);
  12324. + return 0;
  12325. +}
  12326. +
  12327. +/* After this, the ref count of the meta_sk associated with the request_sock
  12328. + * is incremented. Thus it is the responsibility of the caller
  12329. + * to call sock_put() when the reference is not needed anymore.
  12330. + */
  12331. +struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
  12332. + const __be32 laddr, const struct net *net)
  12333. +{
  12334. + struct mptcp_request_sock *mtreq;
  12335. + struct sock *meta_sk = NULL;
  12336. +
  12337. + spin_lock(&mptcp_reqsk_hlock);
  12338. + list_for_each_entry(mtreq,
  12339. + &mptcp_reqsk_htb[inet_synq_hash(raddr, rport, 0,
  12340. + MPTCP_HASH_SIZE)],
  12341. + collide_tuple) {
  12342. + struct inet_request_sock *ireq = inet_rsk(rev_mptcp_rsk(mtreq));
  12343. + meta_sk = mtreq->mpcb->meta_sk;
  12344. +
  12345. + if (ireq->ir_rmt_port == rport &&
  12346. + ireq->ir_rmt_addr == raddr &&
  12347. + ireq->ir_loc_addr == laddr &&
  12348. + rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET &&
  12349. + net_eq(net, sock_net(meta_sk)))
  12350. + break;
  12351. + meta_sk = NULL; /* no match: stay NULL in case the loop ends here */
  12352. + }
  12353. +
  12354. + if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
  12355. + meta_sk = NULL; /* refcount was already zero: report "not found" */
  12356. + spin_unlock(&mptcp_reqsk_hlock);
  12357. +
  12358. + return meta_sk;
  12359. +}
  12360. +
  12361. +/* Create a new IPv4 subflow. Returns 0 on success (the connect may still
  12362. + * be in progress), otherwise the result of the step that failed.
  12363. + * We are in user-context and meta-sock-lock is held.
  12364. + */
  12365. +int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
  12366. + struct mptcp_rem4 *rem)
  12367. +{
  12368. + struct tcp_sock *tp;
  12369. + struct sock *sk;
  12370. + struct sockaddr_in loc_in, rem_in;
  12371. + struct socket sock;
  12372. + int ulid_size = 0, ret;
  12373. +
  12374. + /** First, create and prepare the new socket */
  12375. +
  12376. + sock.type = meta_sk->sk_socket->type;
  12377. + sock.state = SS_UNCONNECTED;
  12378. + sock.wq = meta_sk->sk_socket->wq;
  12379. + sock.file = meta_sk->sk_socket->file;
  12380. + sock.ops = NULL;
  12381. +
  12382. + ret = inet_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
  12383. + if (unlikely(ret < 0)) {
  12384. + mptcp_debug("%s inet_create failed ret: %d\n", __func__, ret);
  12385. + return ret;
  12386. + }
  12387. +
  12388. + sk = sock.sk;
  12389. + tp = tcp_sk(sk);
  12390. +
  12391. + /* All subsockets need the MPTCP-lock-class */
  12392. + lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
  12393. + lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
  12394. +
  12395. + ret = mptcp_add_sock(meta_sk, sk, loc->loc4_id, rem->rem4_id, GFP_KERNEL);
  12396. + if (ret) /* propagate it; ret used to keep the successful inet_create() result here */
  12397. + goto error;
  12398. + tp->mptcp->slave_sk = 1;
  12399. + tp->mptcp->low_prio = loc->low_prio;
  12400. +
  12401. + /* Initializing the timer for an MPTCP subflow */
  12402. + setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
  12403. +
  12404. + /** Then, connect the socket to the peer */
  12405. +
  12406. + ulid_size = sizeof(struct sockaddr_in);
  12407. + loc_in.sin_family = AF_INET;
  12408. + rem_in.sin_family = AF_INET;
  12409. + loc_in.sin_port = 0;
  12410. + if (rem->port)
  12411. + rem_in.sin_port = rem->port;
  12412. + else
  12413. + rem_in.sin_port = inet_sk(meta_sk)->inet_dport; /* no port stored: reuse the meta's destination port */
  12414. + loc_in.sin_addr = loc->addr;
  12415. + rem_in.sin_addr = rem->addr;
  12416. +
  12417. + ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, ulid_size);
  12418. + if (ret < 0) {
  12419. + mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
  12420. + __func__, ret);
  12421. + goto error;
  12422. + }
  12423. +
  12424. + mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d\n",
  12425. + __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
  12426. + tp->mptcp->path_index, &loc_in.sin_addr,
  12427. + ntohs(loc_in.sin_port), &rem_in.sin_addr,
  12428. + ntohs(rem_in.sin_port));
  12429. +
  12430. + ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
  12431. + ulid_size, O_NONBLOCK);
  12432. + if (ret < 0 && ret != -EINPROGRESS) {
  12433. + mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
  12434. + __func__, ret);
  12435. + goto error;
  12436. + }
  12437. +
  12438. + sk_set_socket(sk, meta_sk->sk_socket);
  12439. + sk->sk_wq = meta_sk->sk_wq;
  12440. +
  12441. + return 0;
  12442. +
  12443. +error:
  12444. + /* May happen if mptcp_add_sock fails first */
  12445. + if (!tp->mpc) {
  12446. + tcp_close(sk, 0);
  12447. + } else {
  12448. + local_bh_disable();
  12449. + mptcp_sub_force_close(sk);
  12450. + local_bh_enable();
  12451. + }
  12452. + return ret;
  12453. +}
  12454. +EXPORT_SYMBOL(mptcp_init4_subsockets);
  12455. +
  12456. +/* General initialization of IPv4 for MPTCP */
  12457. +int mptcp_pm_v4_init(void)
  12458. +{
  12459. + int ret = 0;
  12460. + struct request_sock_ops *ops = &mptcp_request_sock_ops;
  12461. + /* Allocate the slab name and slab cache for mptcp_request_sock_ops; returns 0 or -ENOMEM. */
  12462. + ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP");
  12463. + if (ops->slab_name == NULL) {
  12464. + ret = -ENOMEM;
  12465. + goto out; /* nothing allocated yet, nothing to undo */
  12466. + }
  12467. +
  12468. + ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
  12469. + SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
  12470. + NULL);
  12471. +
  12472. + if (ops->slab == NULL) {
  12473. + ret = -ENOMEM;
  12474. + goto err_reqsk_create; /* free the name allocated above */
  12475. + }
  12476. +
  12477. +out:
  12478. + return ret;
  12479. +
  12480. +err_reqsk_create:
  12481. + kfree(ops->slab_name);
  12482. + ops->slab_name = NULL; /* do not leave a dangling pointer in the ops table */
  12483. + goto out;
  12484. +}
  12485. +
  12486. +void mptcp_pm_v4_undo(void)
  12487. +{
  12488. + kmem_cache_destroy(mptcp_request_sock_ops.slab); /* release the cache created by mptcp_pm_v4_init() */
  12489. + kfree(mptcp_request_sock_ops.slab_name); /* then the kasprintf()'d name it was created with */
  12490. +}
  12491. +
  12492. +
  12493. diff --git a/net/mptcp/mptcp_ipv6.c b/net/mptcp/mptcp_ipv6.c
  12494. new file mode 100644
  12495. index 0000000..b6b444d
  12496. --- /dev/null
  12497. +++ b/net/mptcp/mptcp_ipv6.c
  12498. @@ -0,0 +1,822 @@
  12499. +/*
  12500. + * MPTCP implementation - IPv6-specific functions
  12501. + *
  12502. + * Initial Design & Implementation:
  12503. + * Sébastien Barré <sebastien.barre@uclouvain.be>
  12504. + *
  12505. + * Current Maintainer:
  12506. + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
  12507. + *
  12508. + * Additional authors:
  12509. + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
  12510. + * Gregory Detal <gregory.detal@uclouvain.be>
  12511. + * Fabien Duchêne <fabien.duchene@uclouvain.be>
  12512. + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
  12513. + * Lavkesh Lahngir <lavkesh51@gmail.com>
  12514. + * Andreas Ripke <ripke@neclab.eu>
  12515. + * Vlad Dogaru <vlad.dogaru@intel.com>
  12516. + * Octavian Purdila <octavian.purdila@intel.com>
  12517. + * John Ronan <jronan@tssg.org>
  12518. + * Catalin Nicutar <catalin.nicutar@gmail.com>
  12519. + * Brandon Heller <brandonh@stanford.edu>
  12520. + *
  12521. + *
  12522. + * This program is free software; you can redistribute it and/or
  12523. + * modify it under the terms of the GNU General Public License
  12524. + * as published by the Free Software Foundation; either version
  12525. + * 2 of the License, or (at your option) any later version.
  12526. + */
  12527. +
  12528. +#include <linux/export.h>
  12529. +#include <linux/in6.h>
  12530. +#include <linux/kernel.h>
  12531. +
  12532. +#include <net/addrconf.h>
  12533. +#include <net/flow.h>
  12534. +#include <net/inet6_connection_sock.h>
  12535. +#include <net/inet6_hashtables.h>
  12536. +#include <net/inet_common.h>
  12537. +#include <net/ipv6.h>
  12538. +#include <net/ip6_checksum.h>
  12539. +#include <net/ip6_route.h>
  12540. +#include <net/mptcp.h>
  12541. +#include <net/mptcp_v6.h>
  12542. +#include <net/tcp.h>
  12543. +#include <net/transp_v6.h>
  12544. +
  12545. +static int mptcp_v6v4_send_synack(struct sock *meta_sk, struct request_sock *req,
  12546. + u16 queue_mapping);
  12547. +
  12548. +__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
  12549. + __be16 sport, __be16 dport, u32 seq)
  12550. +{
  12551. + u32 secret[MD5_MESSAGE_BYTES / 4];
  12552. + u32 hash[MD5_DIGEST_WORDS];
  12553. + u32 i;
  12554. + /* Nonce = first word of one md5_transform() round over the 4-tuple, seq and mptcp_secret. */
  12555. + memcpy(hash, saddr, 16); /* initial state: the 16 bytes at saddr */
  12556. + for (i = 0; i < 4; i++)
  12557. + secret[i] = mptcp_secret[i] + (__force u32)daddr[i]; /* mix in the four words at daddr */
  12558. + secret[4] = mptcp_secret[4] +
  12559. + (((__force u16)sport << 16) + (__force u16)dport); /* both ports packed into one word */
  12560. + secret[5] = seq; /* caller-supplied value, used as-is */
  12561. + for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++) /* words 6.. are copied unchanged */
  12562. + secret[i] = mptcp_secret[i];
  12563. +
  12564. + md5_transform(hash, secret);
  12565. +
  12566. + return hash[0];
  12567. +}
  12568. +
  12569. +u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
  12570. + __be16 sport, __be16 dport)
  12571. +{
  12572. + u32 secret[MD5_MESSAGE_BYTES / 4];
  12573. + u32 hash[MD5_DIGEST_WORDS];
  12574. + u32 i;
  12575. + /* Key = first 64 bits of one md5_transform() round over the 4-tuple, a seed and mptcp_secret. */
  12576. + memcpy(hash, saddr, 16); /* initial state: the 16 bytes at saddr */
  12577. + for (i = 0; i < 4; i++)
  12578. + secret[i] = mptcp_secret[i] + (__force u32)daddr[i]; /* mix in the four words at daddr */
  12579. + secret[4] = mptcp_secret[4] +
  12580. + (((__force u16)sport << 16) + (__force u16)dport); /* both ports packed into one word */
  12581. + secret[5] = mptcp_key_seed++; /* per-call seed: two calls for the same 4-tuple differ */
  12582. + for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++) /* start at 6 so the seed in secret[5] survives */
  12583. + secret[i] = mptcp_secret[i];
  12584. +
  12585. + md5_transform(hash, secret);
  12586. +
  12587. + return *((u64 *)hash);
  12588. +}
  12589. +
  12590. +static void mptcp_v6_reqsk_destructor(struct request_sock *req)
  12591. +{
  12592. + mptcp_reqsk_destructor(req); /* MPTCP-level teardown runs first */
  12593. + /* ...followed by the regular TCPv6 request-sock destructor. */
  12594. + tcp_v6_reqsk_destructor(req);
  12595. +}
  12596. +
  12597. +/* Similar to tcp_v6_rtx_synack */
  12598. +static int mptcp_v6_rtx_synack(struct sock *meta_sk, struct request_sock *req)
  12599. +{
  12600. + if (meta_sk->sk_family == AF_INET6) /* IPv6 meta-socket: the stock TCPv6 path is enough */
  12601. + return tcp_v6_rtx_synack(meta_sk, req);
  12602. + /* Meta-socket is not AF_INET6: account the retransmission and resend via the v6v4 helper. */
  12603. + TCP_INC_STATS_BH(sock_net(meta_sk), TCP_MIB_RETRANSSEGS);
  12604. + return mptcp_v6v4_send_synack(meta_sk, req, 0);
  12605. +}
  12606. +
  12607. +/* Similar to tcp6_request_sock_ops */
  12608. +struct request_sock_ops mptcp6_request_sock_ops __read_mostly = {
  12609. + .family = AF_INET6,
  12610. + .obj_size = sizeof(struct mptcp_request_sock),
  12611. + .rtx_syn_ack = mptcp_v6_rtx_synack,
  12612. + .send_ack = tcp_v6_reqsk_send_ack,
  12613. + .destructor = mptcp_v6_reqsk_destructor,
  12614. + .send_reset = tcp_v6_send_reset,
  12615. + .syn_ack_timeout = tcp_syn_ack_timeout,
  12616. +};
  12617. +
  12618. +static void mptcp_v6_reqsk_queue_hash_add(struct sock *meta_sk,
  12619. + struct request_sock *req,
  12620. + unsigned long timeout)
  12621. +{
  12622. + const u32 h1 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
  12623. + inet_rsk(req)->ir_rmt_port,
  12624. + 0, MPTCP_HASH_SIZE);
  12625. + /* We cannot call inet6_csk_reqsk_queue_hash_add(), because we do not
  12626. + * want to reset the keepalive-timer (responsible for retransmitting
  12627. + * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot
  12628. + * overload the keepalive timer. Also, it's not a big deal, because the
  12629. + * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So,
  12630. + * if the third ACK gets lost, the client will handle the retransmission
  12631. + * anyways. If our SYN/ACK gets lost, the client will retransmit the
  12632. + * SYN.
  12633. + */
  12634. + struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
  12635. + struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt;
  12636. + const u32 h2 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
  12637. + inet_rsk(req)->ir_rmt_port,
  12638. + lopt->hash_rnd, lopt->nr_table_entries);
  12639. +
  12640. + reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout);
  12641. + reqsk_queue_added(&meta_icsk->icsk_accept_queue);
  12642. +
  12643. + spin_lock(&mptcp_reqsk_hlock);
  12644. + list_add(&mptcp_rsk(req)->collide_tuple, &mptcp_reqsk_htb[h1]);
  12645. + spin_unlock(&mptcp_reqsk_hlock);
  12646. +}
  12647. +
  12648. +/* Similar to tcp_v6_send_synack
  12649. + *
  12650. + * The meta-socket is IPv4, but a new subsocket is IPv6
  12651. + */
  12652. +static int mptcp_v6v4_send_synack(struct sock *meta_sk, struct request_sock *req,
  12653. + u16 queue_mapping)
  12654. +{
  12655. + struct inet_request_sock *treq = inet_rsk(req);
  12656. + struct sk_buff *skb;
  12657. + struct flowi6 fl6;
  12658. + struct dst_entry *dst;
  12659. + int err = -ENOMEM;
  12660. +
  12661. + memset(&fl6, 0, sizeof(fl6));
  12662. + fl6.flowi6_proto = IPPROTO_TCP;
  12663. + fl6.daddr = treq->ir_v6_rmt_addr;
  12664. + fl6.saddr = treq->ir_v6_loc_addr;
  12665. + fl6.flowlabel = 0;
  12666. + fl6.flowi6_oif = treq->ir_iif;
  12667. + fl6.flowi6_mark = meta_sk->sk_mark;
  12668. + fl6.fl6_dport = inet_rsk(req)->ir_rmt_port;
  12669. + fl6.fl6_sport = htons(inet_rsk(req)->ir_num);
  12670. + security_req_classify_flow(req, flowi6_to_flowi(&fl6));
  12671. +
  12672. + dst = ip6_dst_lookup_flow(meta_sk, &fl6, NULL);
  12673. + if (IS_ERR(dst)) {
  12674. + err = PTR_ERR(dst);
  12675. + return err;
  12676. + }
  12677. + skb = tcp_make_synack(meta_sk, dst, req, NULL);
  12678. +
  12679. + if (skb) {
  12680. + __tcp_v6_send_check(skb, &treq->ir_v6_loc_addr,
  12681. + &treq->ir_v6_rmt_addr);
  12682. +
  12683. + fl6.daddr = treq->ir_v6_rmt_addr;
  12684. + skb_set_queue_mapping(skb, queue_mapping);
  12685. + err = ip6_xmit(meta_sk, skb, &fl6, NULL, 0);
  12686. + err = net_xmit_eval(err);
  12687. + }
  12688. +
  12689. + return err;
  12690. +}
  12691. +
  12692. +/* Similar to tcp_v6_syn_recv_sock
  12693. + *
  12694. + * The meta-socket is IPv4, but a new subsocket is IPv6
  12695. + */
  12696. +struct sock *mptcp_v6v4_syn_recv_sock(struct sock *meta_sk, struct sk_buff *skb,
  12697. + struct request_sock *req,
  12698. + struct dst_entry *dst)
  12699. +{
  12700. + struct inet_request_sock *treq;
  12701. + struct ipv6_pinfo *newnp;
  12702. + struct tcp6_sock *newtcp6sk;
  12703. + struct inet_sock *newinet;
  12704. + struct tcp_sock *newtp;
  12705. + struct sock *newsk;
  12706. +
  12707. + treq = inet_rsk(req);
  12708. +
  12709. + if (sk_acceptq_is_full(meta_sk))
  12710. + goto out_overflow;
  12711. +
  12712. + if (!dst) {
  12713. + /* This code is similar to inet6_csk_route_req, but as we
  12714. + * don't have a np-pointer in the meta, we have to do it
  12715. + * manually.
  12716. + */
  12717. + struct flowi6 fl6;
  12718. +
  12719. + memset(&fl6, 0, sizeof(fl6));
  12720. + fl6.flowi6_proto = IPPROTO_TCP;
  12721. + fl6.daddr = treq->ir_v6_rmt_addr;
  12722. + fl6.saddr = treq->ir_v6_loc_addr;
  12723. + fl6.flowi6_oif = treq->ir_iif;
  12724. + fl6.flowi6_mark = meta_sk->sk_mark;
  12725. + fl6.fl6_dport = inet_rsk(req)->ir_rmt_port;
  12726. + fl6.fl6_sport = htons(inet_rsk(req)->ir_num);
  12727. + security_req_classify_flow(req, flowi6_to_flowi(&fl6));
  12728. +
  12729. + dst = ip6_dst_lookup_flow(meta_sk, &fl6, NULL);
  12730. + if (IS_ERR(dst))
  12731. + goto out;
  12732. + }
  12733. +
  12734. + newsk = tcp_create_openreq_child(meta_sk, req, skb);
  12735. + if (newsk == NULL)
  12736. + goto out_nonewsk;
  12737. +
  12738. + /* Diff to tcp_v6_syn_recv_sock: Must do this prior to __ip6_dst_store,
  12739. + * as it tries to access the pinet6-pointer.
  12740. + */
  12741. + newtcp6sk = (struct tcp6_sock *)newsk;
  12742. + inet_sk(newsk)->pinet6 = &newtcp6sk->inet6;
  12743. +
  12744. + /*
  12745. + * No need to charge this sock to the relevant IPv6 refcnt debug socks
  12746. + * count here, tcp_create_openreq_child now does this for us, see the
  12747. + * comment in that function for the gory details. -acme
  12748. + */
  12749. +
  12750. + newsk->sk_gso_type = SKB_GSO_TCPV6;
  12751. + __ip6_dst_store(newsk, dst, NULL, NULL);
  12752. + inet6_sk_rx_dst_set(newsk, skb);
  12753. +
  12754. + newtp = tcp_sk(newsk);
  12755. + newinet = inet_sk(newsk);
  12756. + newnp = inet6_sk(newsk);
  12757. +
  12758. + newsk->sk_v6_daddr = treq->ir_v6_rmt_addr;
  12759. + newnp->saddr = treq->ir_v6_loc_addr;
  12760. + newsk->sk_v6_rcv_saddr = treq->ir_v6_loc_addr;
  12761. + newsk->sk_bound_dev_if = treq->ir_iif;
  12762. +
  12763. + /* Now IPv6 options...
  12764. +
  12765. + First: no IPv4 options.
  12766. + */
  12767. + newinet->inet_opt = NULL;
  12768. + newnp->ipv6_ac_list = NULL;
  12769. + newnp->ipv6_fl_list = NULL;
  12770. + newnp->rxopt.all = 0;
  12771. +
  12772. + /* Clone pktoptions received with SYN */
  12773. + newnp->pktoptions = NULL;
  12774. + if (treq->pktopts != NULL) {
  12775. + newnp->pktoptions = skb_clone(treq->pktopts,
  12776. + sk_gfp_atomic(meta_sk, GFP_ATOMIC));
  12777. + consume_skb(treq->pktopts);
  12778. + treq->pktopts = NULL;
  12779. + if (newnp->pktoptions)
  12780. + skb_set_owner_r(newnp->pktoptions, newsk);
  12781. + }
  12782. + newnp->opt = NULL;
  12783. + newnp->mcast_oif = inet6_iif(skb);
  12784. + newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
  12785. + newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb));
  12786. +
  12787. + /* Initialization copied from inet6_create - normally this should have
  12788. + * been handled by the memcpy as in tcp_v6_syn_recv_sock
  12789. + */
  12790. + newnp->hop_limit = -1;
  12791. + newnp->mc_loop = 1;
  12792. + newnp->pmtudisc = IPV6_PMTUDISC_WANT;
  12793. + (void)xchg(&newnp->rxpmtu, NULL);
  12794. +
  12795. + inet_csk(newsk)->icsk_ext_hdr_len = 0;
  12796. +
  12797. + tcp_mtup_init(newsk);
  12798. + tcp_sync_mss(newsk, dst_mtu(dst));
  12799. + newtp->advmss = dst_metric_advmss(dst);
  12800. + if (tcp_sk(meta_sk)->rx_opt.user_mss &&
  12801. + tcp_sk(meta_sk)->rx_opt.user_mss < newtp->advmss)
  12802. + newtp->advmss = tcp_sk(meta_sk)->rx_opt.user_mss;
  12803. +
  12804. + tcp_initialize_rcv_mss(newsk);
  12805. +
  12806. + newinet->inet_daddr = LOOPBACK4_IPV6;
  12807. + newinet->inet_saddr = LOOPBACK4_IPV6;
  12808. + newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
  12809. +
  12810. + if (__inet_inherit_port(meta_sk, newsk) < 0) {
  12811. + inet_csk_prepare_forced_close(newsk);
  12812. + tcp_done(newsk);
  12813. + goto out;
  12814. + }
  12815. + __inet6_hash(newsk, NULL);
  12816. +
  12817. + return newsk;
  12818. +
  12819. +out_overflow:
  12820. + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_LISTENOVERFLOWS);
  12821. +out_nonewsk:
  12822. + dst_release(dst);
  12823. +out:
  12824. + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_LISTENDROPS);
  12825. + return NULL;
  12826. +}
  12827. +
  12828. +/* Similar to tcp_v6_conn_request */
  12829. +static void mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb)
  12830. +{
  12831. + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
  12832. + struct tcp_options_received tmp_opt;
  12833. + struct mptcp_options_received mopt;
  12834. + struct ipv6_pinfo *np = inet6_sk(meta_sk);
  12835. + struct request_sock *req;
  12836. + struct inet_request_sock *treq;
  12837. + struct mptcp_request_sock *mtreq;
  12838. + u8 mptcp_hash_mac[20];
  12839. + __u32 isn = TCP_SKB_CB(skb)->when;
  12840. + struct dst_entry *dst = NULL;
  12841. + struct flowi6 fl6;
  12842. + int want_cookie = 0;
  12843. + union inet_addr addr;
  12844. +
  12845. + tcp_clear_options(&tmp_opt);
  12846. + mptcp_init_mp_opt(&mopt);
  12847. + tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
  12848. + tmp_opt.user_mss = tcp_sk(meta_sk)->rx_opt.user_mss;
  12849. + tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
  12850. +
  12851. + req = inet6_reqsk_alloc(&mptcp6_request_sock_ops);
  12852. + if (!req)
  12853. + return;
  12854. +
  12855. +#ifdef CONFIG_TCP_MD5SIG
  12856. + tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
  12857. +#endif
  12858. +
  12859. + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
  12860. + tcp_openreq_init(req, &tmp_opt, skb);
  12861. +
  12862. + treq = inet_rsk(req);
  12863. + treq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
  12864. + treq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
  12865. +
  12866. + if (!want_cookie || tmp_opt.tstamp_ok)
  12867. + TCP_ECN_create_request(req, skb, sock_net(meta_sk));
  12868. +
  12869. + treq->ir_iif = meta_sk->sk_bound_dev_if;
  12870. +
  12871. + /* So that link locals have meaning */
  12872. + if (!meta_sk->sk_bound_dev_if &&
  12873. + ipv6_addr_type(&treq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
  12874. + treq->ir_iif = inet6_iif(skb);
  12875. +
  12876. + if (!isn) {
  12877. + if (meta_sk->sk_family == AF_INET6 &&
  12878. + (ipv6_opt_accepted(meta_sk, skb) ||
  12879. + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
  12880. + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)) {
  12881. + atomic_inc(&skb->users);
  12882. + treq->pktopts = skb;
  12883. + }
  12884. +
  12885. + /* VJ's idea. We save last timestamp seen
  12886. + * from the destination in peer table, when entering
  12887. + * state TIME-WAIT, and check against it before
  12888. + * accepting new connection request.
  12889. + *
  12890. + * If "isn" is not zero, this request hit alive
  12891. + * timewait bucket, so that all the necessary checks
  12892. + * are made in the function processing timewait state.
  12893. + */
  12894. + if (tmp_opt.saw_tstamp &&
  12895. + tcp_death_row.sysctl_tw_recycle &&
  12896. + (dst = inet6_csk_route_req(meta_sk, &fl6, req)) != NULL) {
  12897. + if (!tcp_peer_is_proven(req, dst, true)) {
  12898. + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_PAWSPASSIVEREJECTED);
  12899. + goto drop_and_release;
  12900. + }
  12901. + }
  12902. + /* Kill the following clause, if you dislike this way. */
  12903. + else if (!sysctl_tcp_syncookies &&
  12904. + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(meta_sk) <
  12905. + (sysctl_max_syn_backlog >> 2)) &&
  12906. + !tcp_peer_is_proven(req, dst, false)) {
  12907. + /* Without syncookies last quarter of
  12908. + * backlog is filled with destinations,
  12909. + * proven to be alive.
  12910. + * It means that we continue to communicate
  12911. + * to destinations, already remembered
  12912. + * to the moment of synflood.
  12913. + */
  12914. + LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n",
  12915. + &treq->ir_v6_rmt_addr,
  12916. + ntohs(tcp_hdr(skb)->source));
  12917. + goto drop_and_release;
  12918. + }
  12919. +
  12920. + isn = tcp_v6_init_sequence(skb);
  12921. + }
  12922. +
  12923. + tcp_rsk(req)->snt_isn = isn;
  12924. + tcp_rsk(req)->snt_synack = tcp_time_stamp;
  12925. + tcp_rsk(req)->listener = NULL;
  12926. +
  12927. + mtreq = mptcp_rsk(req);
  12928. + mtreq->mpcb = mpcb;
  12929. + INIT_LIST_HEAD(&mtreq->collide_tuple);
  12930. + mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce;
  12931. + mtreq->mptcp_rem_key = mpcb->mptcp_rem_key;
  12932. + mtreq->mptcp_loc_key = mpcb->mptcp_loc_key;
  12933. + mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->daddr.s6_addr32,
  12934. + ipv6_hdr(skb)->saddr.s6_addr32,
  12935. + tcp_hdr(skb)->dest,
  12936. + tcp_hdr(skb)->source, isn);
  12937. + mptcp_hmac_sha1((u8 *)&mtreq->mptcp_loc_key,
  12938. + (u8 *)&mtreq->mptcp_rem_key,
  12939. + (u8 *)&mtreq->mptcp_loc_nonce,
  12940. + (u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac);
  12941. + mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac;
  12942. +
  12943. + addr.in6 = treq->ir_v6_loc_addr;
  12944. + mtreq->loc_id = mpcb->pm_ops->get_local_id(AF_INET6, &addr, sock_net(meta_sk));
  12945. + if (mtreq->loc_id == -1) /* Address not part of the allowed ones */
  12946. + goto drop_and_release;
  12947. + mtreq->rem_id = mopt.rem_id;
  12948. + mtreq->low_prio = mopt.low_prio;
  12949. + tcp_rsk(req)->saw_mpc = 1;
  12950. +
  12951. + if (meta_sk->sk_family == AF_INET6) {
  12952. + if (tcp_v6_send_synack(meta_sk, dst, &fl6, req,
  12953. + skb_get_queue_mapping(skb)))
  12954. + goto drop_and_free;
  12955. + } else {
  12956. + if (mptcp_v6v4_send_synack(meta_sk, req, skb_get_queue_mapping(skb)))
  12957. + goto drop_and_free;
  12958. + }
  12959. +
  12960. + /* Adding to request queue in metasocket */
  12961. + mptcp_v6_reqsk_queue_hash_add(meta_sk, req, TCP_TIMEOUT_INIT);
  12962. +
  12963. + return;
  12964. +
  12965. +drop_and_release:
  12966. + dst_release(dst);
  12967. +drop_and_free:
  12968. + reqsk_free(req);
  12969. + return;
  12970. +}
  12971. +
  12972. +int mptcp_v6_rem_raddress(struct mptcp_cb *mpcb, u8 id)
  12973. +{
  12974. + int i;
  12975. + /* Drop the remote IPv6 address registered under 'id'; returns 0 if found, -1 otherwise. */
  12976. + for (i = 0; i < MPTCP_MAX_ADDR; i++) {
  12977. + if (!((1 << i) & mpcb->rem6_bits))
  12978. + continue; /* slot i is not in use */
  12979. +
  12980. + if (mpcb->remaddr6[i].rem6_id == id) {
  12981. + /* clear the slot's in-use bit; the remaddr6[] entry itself is not wiped */
  12982. + mpcb->rem6_bits &= ~(1 << i);
  12983. +
  12984. + return 0;
  12985. + }
  12986. + }
  12987. +
  12988. + return -1; /* no in-use slot carries this id */
  12989. +}
  12990. +
  12991. +/* Returns -1 if there is no space anymore to store an additional
  12992. + * address
  12993. + */
  12994. +int mptcp_v6_add_raddress(struct mptcp_cb *mpcb, const struct in6_addr *addr,
  12995. + __be16 port, u8 id)
  12996. +{
  12997. + int i;
  12998. + struct mptcp_rem6 *rem6;
  12999. +
  13000. + mptcp_for_each_bit_set(mpcb->rem6_bits, i) {
  13001. + rem6 = &mpcb->remaddr6[i];
  13002. +
  13003. + /* Address is already in the list --- continue */
  13004. + if (rem6->rem6_id == id &&
  13005. + ipv6_addr_equal(&rem6->addr, addr) && rem6->port == port)
  13006. + return 0;
  13007. +
  13008. + /* This may be the case, when the peer is behind a NAT. He is
  13009. + * trying to JOIN, thus sending the JOIN with a certain ID.
  13010. + * However the src_addr of the IP-packet has been changed. We
  13011. + * update the addr in the list, because this is the address as
  13012. + * OUR BOX sees it.
  13013. + */
  13014. + if (rem6->rem6_id == id) {
  13015. + /* update the address */
  13016. + mptcp_debug("%s: updating old addr: %pI6 to addr %pI6 with id:%d\n",
  13017. + __func__, &rem6->addr, addr, id);
  13018. + rem6->addr = *addr;
  13019. + rem6->port = port;
  13020. + mpcb->list_rcvd = 1;
  13021. + return 0;
  13022. + }
  13023. + }
  13024. +
  13025. + i = mptcp_find_free_index(mpcb->rem6_bits);
  13026. + /* Do we have already the maximum number of local/remote addresses? */
  13027. + if (i < 0) {
  13028. + mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI6\n",
  13029. + __func__, MPTCP_MAX_ADDR, addr);
  13030. + return -1;
  13031. + }
  13032. +
  13033. + rem6 = &mpcb->remaddr6[i];
  13034. +
  13035. + /* Address is not known yet, store it */
  13036. + rem6->addr = *addr;
  13037. + rem6->port = port;
  13038. + rem6->bitfield = 0;
  13039. + rem6->retry_bitfield = 0;
  13040. + rem6->rem6_id = id;
  13041. + mpcb->list_rcvd = 1;
  13042. + mpcb->rem6_bits |= (1 << i);
  13043. +
  13044. + return 0;
  13045. +}
  13046. +
  13047. +/* Sets the bitfield of the remote-address field
  13048. + * local address is not set as it will disappear with the global address-list
  13049. + */
  13050. +void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb,
  13051. + const struct in6_addr *daddr, int index)
  13052. +{
  13053. + int i;
  13054. + mptcp_for_each_bit_set(mpcb->rem6_bits, i) { /* visit only the in-use remote slots */
  13055. + if (ipv6_addr_equal(&mpcb->remaddr6[i].addr, daddr)) {
  13056. + mpcb->remaddr6[i].bitfield |= (1 << index); /* set bit 'index' on the matching entry */
  13057. + return; /* only the first entry equal to daddr is updated */
  13058. + }
  13059. + }
  13060. +}
  13061. +
  13062. +int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
  13063. +{
  13064. + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
  13065. + struct sock *child, *rsk = NULL;
  13066. + int ret;
  13067. + /* Returns tcp_v6_do_rcv()'s result for non-JOIN segments, 0 in every other case. */
  13068. + if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
  13069. + struct tcphdr *th = tcp_hdr(skb);
  13070. + const struct ipv6hdr *ip6h = ipv6_hdr(skb);
  13071. + struct sock *sk;
  13072. +
  13073. + sk = __inet6_lookup_established(sock_net(meta_sk),
  13074. + &tcp_hashinfo,
  13075. + &ip6h->saddr, th->source,
  13076. + &ip6h->daddr, ntohs(th->dest),
  13077. + inet6_iif(skb));
  13078. +
  13079. + if (!sk) {
  13080. + kfree_skb(skb);
  13081. + return 0;
  13082. + }
  13083. + if (is_meta_sk(sk)) { /* the lookup returned a meta-socket, not a subflow */
  13084. + WARN(1, "%s Did not find a sub-sk!\n", __func__);
  13085. + kfree_skb(skb);
  13086. + sock_put(sk);
  13087. + return 0;
  13088. + }
  13089. +
  13090. + if (sk->sk_state == TCP_TIME_WAIT) {
  13091. + inet_twsk_put(inet_twsk(sk));
  13092. + kfree_skb(skb);
  13093. + return 0;
  13094. + }
  13095. +
  13096. + ret = tcp_v6_do_rcv(sk, skb);
  13097. + sock_put(sk);
  13098. +
  13099. + return ret;
  13100. + }
  13101. + TCP_SKB_CB(skb)->mptcp_flags = 0; /* clear the MPTCP flags before handling the JOIN */
  13102. +
  13103. + /* Has been removed from the tk-table. Thus, no new subflows.
  13104. + *
  13105. + * Check for close-state is necessary, because we may have been closed
  13106. + * without passing by mptcp_close().
  13107. + *
  13108. + * When falling back, no new subflows are allowed either.
  13109. + */
  13110. + if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
  13111. + mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
  13112. + goto reset_and_discard;
  13113. +
  13114. + child = tcp_v6_hnd_req(meta_sk, skb);
  13115. +
  13116. + if (!child)
  13117. + goto discard;
  13118. +
  13119. + if (child != meta_sk) {
  13120. + sock_rps_save_rxhash(child, skb);
  13121. + /* We don't call tcp_child_process here, because we hold
  13122. + * already the meta-sk-lock and are sure that it is not owned
  13123. + * by the user.
  13124. + */
  13125. + ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
  13126. + bh_unlock_sock(child);
  13127. + sock_put(child);
  13128. + if (ret) {
  13129. + rsk = child;
  13130. + goto reset_and_discard;
  13131. + }
  13132. + } else {
  13133. + if (tcp_hdr(skb)->syn) {
  13134. + struct mp_join *join_opt = mptcp_find_join(skb);
  13135. + /* Currently we make two calls to mptcp_find_join(). This
  13136. + * can probably be optimized. */
  13137. + if (mptcp_v6_add_raddress(mpcb,
  13138. + (struct in6_addr *)&ipv6_hdr(skb)->saddr,
  13139. + 0,
  13140. + join_opt->addr_id) < 0)
  13141. + goto reset_and_discard;
  13142. + mpcb->list_rcvd = 0;
  13143. +
  13144. + mptcp_v6_join_request(meta_sk, skb);
  13145. + goto discard;
  13146. + }
  13147. + goto reset_and_discard;
  13148. + }
  13149. + return 0;
  13150. +
  13151. +reset_and_discard:
  13152. + tcp_v6_send_reset(rsk, skb);
  13153. +discard:
  13154. + kfree_skb(skb);
  13155. + return 0;
  13156. +}
  13157. +
  13158. +/* After this, the ref count of the meta_sk associated with the request_sock
  13159. + * is incremented. Thus it is the responsibility of the caller
  13160. + * to call sock_put() when the reference is not needed anymore.
  13161. + */
  13162. +struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
  13163. + const struct in6_addr *laddr, const struct net *net)
  13164. +{
  13165. + struct mptcp_request_sock *mtreq;
  13166. + struct sock *meta_sk = NULL;
  13167. + /* Walk the (raddr, rport) bucket; return the meta-socket of the first matching IPv6 request, or NULL. */
  13168. + spin_lock(&mptcp_reqsk_hlock);
  13169. + list_for_each_entry(mtreq,
  13170. + &mptcp_reqsk_htb[inet6_synq_hash(raddr, rport, 0,
  13171. + MPTCP_HASH_SIZE)],
  13172. + collide_tuple) {
  13173. + struct inet_request_sock *treq = inet_rsk(rev_mptcp_rsk(mtreq));
  13174. + meta_sk = mtreq->mpcb->meta_sk;
  13175. +
  13176. + if (inet_rsk(rev_mptcp_rsk(mtreq))->ir_rmt_port == rport &&
  13177. + rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET6 &&
  13178. + ipv6_addr_equal(&treq->ir_v6_rmt_addr, raddr) &&
  13179. + ipv6_addr_equal(&treq->ir_v6_loc_addr, laddr) &&
  13180. + net_eq(net, sock_net(meta_sk)))
  13181. + break;
  13182. + meta_sk = NULL; /* no match: reset so a completed walk yields NULL */
  13183. + }
  13184. +
  13185. + if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
  13186. + meta_sk = NULL; /* refcount was already zero: report "not found" */
  13187. + spin_unlock(&mptcp_reqsk_hlock);
  13188. +
  13189. + return meta_sk;
  13190. +}
  13191. +
  13192. +/* Create a new IPv6 subflow.
  13193. + *
  13194. + * We are in user-context and the meta-sock-lock is held.
  13195. + */
  13196. +int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
  13197. + struct mptcp_rem6 *rem)
  13198. +{
  13199. + struct tcp_sock *tp;
  13200. + struct sock *sk;
  13201. + struct sockaddr_in6 loc_in = { 0 }, rem_in = { 0 }; /* zeroed: flowinfo/scope_id are not set below */
  13202. + struct socket sock;
  13203. + int ulid_size = 0, ret;
  13204. + /* Returns 0 once connect() succeeded or is in progress, otherwise the failing call's error. */
  13205. + /** First, create and prepare the new socket */
  13206. +
  13207. + sock.type = meta_sk->sk_socket->type;
  13208. + sock.state = SS_UNCONNECTED;
  13209. + sock.wq = meta_sk->sk_socket->wq;
  13210. + sock.file = meta_sk->sk_socket->file;
  13211. + sock.ops = NULL;
  13212. +
  13213. + ret = inet6_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
  13214. + if (unlikely(ret < 0)) {
  13215. + mptcp_debug("%s inet6_create failed ret: %d\n", __func__, ret);
  13216. + return ret;
  13217. + }
  13218. +
  13219. + sk = sock.sk;
  13220. + tp = tcp_sk(sk);
  13221. +
  13222. + /* All subsockets need the MPTCP-lock-class */
  13223. + lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
  13224. + lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
  13225. +
  13226. + ret = mptcp_add_sock(meta_sk, sk, loc->loc6_id, rem->rem6_id, GFP_KERNEL);
  13227. + if (ret) /* keep the error in ret; it used to stay at inet6_create()'s success value */
  13228. + goto error;
  13229. + tp->mptcp->slave_sk = 1;
  13230. + tp->mptcp->low_prio = loc->low_prio;
  13231. +
  13232. + /* Initializing the timer for an MPTCP subflow */
  13233. + setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
  13234. +
  13235. + /** Then, connect the socket to the peer */
  13236. +
  13237. + ulid_size = sizeof(struct sockaddr_in6);
  13238. + loc_in.sin6_family = AF_INET6;
  13239. + rem_in.sin6_family = AF_INET6;
  13240. + loc_in.sin6_port = 0;
  13241. + if (rem->port)
  13242. + rem_in.sin6_port = rem->port;
  13243. + else
  13244. + rem_in.sin6_port = inet_sk(meta_sk)->inet_dport; /* no port given: reuse the meta-socket's peer port */
  13245. + loc_in.sin6_addr = loc->addr;
  13246. + rem_in.sin6_addr = rem->addr;
  13247. +
  13248. + ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, ulid_size);
  13249. + if (ret < 0) {
  13250. + mptcp_debug("%s: MPTCP subsocket bind()failed, error %d\n",
  13251. + __func__, ret);
  13252. + goto error;
  13253. + }
  13254. +
  13255. + mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d\n",
  13256. + __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
  13257. + tp->mptcp->path_index, &loc_in.sin6_addr,
  13258. + ntohs(loc_in.sin6_port), &rem_in.sin6_addr,
  13259. + ntohs(rem_in.sin6_port));
  13260. +
  13261. + ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
  13262. + ulid_size, O_NONBLOCK);
  13263. + if (ret < 0 && ret != -EINPROGRESS) { /* -EINPROGRESS is the normal non-blocking outcome */
  13264. + mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
  13265. + __func__, ret);
  13266. + goto error;
  13267. + }
  13268. +
  13269. + sk_set_socket(sk, meta_sk->sk_socket); /* replace the on-stack struct socket with the meta's */
  13270. + sk->sk_wq = meta_sk->sk_wq;
  13271. +
  13272. + return 0;
  13273. +
  13274. +error:
  13275. + /* May happen if mptcp_add_sock fails first */
  13276. + if (!tp->mpc) {
  13277. + tcp_close(sk, 0);
  13278. + } else {
  13279. + local_bh_disable();
  13280. + mptcp_sub_force_close(sk);
  13281. + local_bh_enable();
  13282. + }
  13283. + return ret;
  13284. +}
  13285. +EXPORT_SYMBOL(mptcp_init6_subsockets);
  13286. +
  13287. +int mptcp_pm_v6_init(void)
  13288. +{
  13289. + int ret = 0;
  13290. + struct request_sock_ops *ops = &mptcp6_request_sock_ops;
  13291. + /* Allocate the slab name and slab cache for mptcp6_request_sock_ops; returns 0 or -ENOMEM. */
  13292. + ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6");
  13293. + if (ops->slab_name == NULL) {
  13294. + ret = -ENOMEM;
  13295. + goto out; /* nothing allocated yet, nothing to undo */
  13296. + }
  13297. +
  13298. + ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
  13299. + SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
  13300. + NULL);
  13301. +
  13302. + if (ops->slab == NULL) {
  13303. + ret = -ENOMEM;
  13304. + goto err_reqsk_create; /* free the name allocated above */
  13305. + }
  13306. +
  13307. +out:
  13308. + return ret;
  13309. +
  13310. +err_reqsk_create:
  13311. + kfree(ops->slab_name);
  13312. + ops->slab_name = NULL; /* do not leave a dangling pointer in the ops table */
  13313. + goto out;
  13314. +}
  13315. +
  13316. +void mptcp_pm_v6_undo(void)
  13317. +{
  13318. + kmem_cache_destroy(mptcp6_request_sock_ops.slab);
  13319. + kfree(mptcp6_request_sock_ops.slab_name);
  13320. +}
  13321. diff --git a/net/mptcp/mptcp_ndiffports.c b/net/mptcp/mptcp_ndiffports.c
  13322. new file mode 100644
  13323. index 0000000..a126325
  13324. --- /dev/null
  13325. +++ b/net/mptcp/mptcp_ndiffports.c
  13326. @@ -0,0 +1,171 @@
  13327. +#include <linux/module.h>
  13328. +
  13329. +#include <net/mptcp.h>
  13330. +#include <net/mptcp_v4.h>
  13331. +
  13332. +#if IS_ENABLED(CONFIG_IPV6)
  13333. +#include <net/mptcp_v6.h>
  13334. +#endif
  13335. +
  13336. +struct ndiffports_priv {
  13337. + /* Worker struct for subflow establishment */
  13338. + struct work_struct subflow_work;
  13339. +
  13340. + struct mptcp_cb *mpcb;
  13341. +};
  13342. +
  13343. +static int sysctl_mptcp_ndiffports __read_mostly = 2;
  13344. +
  13345. +/**
  13346. + * Create all new subflows, by doing calls to mptcp_initX_subsockets
  13347. + *
  13348. + * This function uses a goto next_subflow, to allow releasing the lock between
  13349. + * new subflows and giving other processes a chance to do some work on the
  13350. + * socket and potentially finishing the communication.
  13351. + **/
  13352. +static void create_subflow_worker(struct work_struct *work)
  13353. +{
  13354. + struct ndiffports_priv *pm_priv = container_of(work,
  13355. + struct ndiffports_priv,
  13356. + subflow_work);
  13357. + struct mptcp_cb *mpcb = pm_priv->mpcb;
  13358. + struct sock *meta_sk = mpcb->meta_sk;
  13359. + int iter = 0;
  13360. +
  13361. +next_subflow:
  13362. + if (iter) {
  13363. + release_sock(meta_sk);
  13364. + mutex_unlock(&mpcb->mpcb_mutex);
  13365. +
  13366. + yield();
  13367. + }
  13368. + mutex_lock(&mpcb->mpcb_mutex);
  13369. + lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
  13370. +
  13371. + iter++;
  13372. +
  13373. + if (sock_flag(meta_sk, SOCK_DEAD))
  13374. + goto exit;
  13375. +
  13376. + if (mpcb->master_sk &&
  13377. + !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
  13378. + goto exit;
  13379. +
  13380. + if (sysctl_mptcp_ndiffports > iter &&
  13381. + sysctl_mptcp_ndiffports > mpcb->cnt_subflows) {
  13382. + if (meta_sk->sk_family == AF_INET ||
  13383. + mptcp_v6_is_v4_mapped(meta_sk)) {
  13384. + struct mptcp_loc4 loc;
  13385. +
  13386. + loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
  13387. + loc.loc4_id = 0;
  13388. + loc.low_prio = 0;
  13389. +
  13390. + mptcp_init4_subsockets(meta_sk, &loc, &mpcb->remaddr4[0]);
  13391. + } else {
  13392. +#if IS_ENABLED(CONFIG_IPV6)
  13393. + struct mptcp_loc6 loc;
  13394. +
  13395. + loc.addr = inet6_sk(meta_sk)->saddr;
  13396. + loc.loc6_id = 0;
  13397. + loc.low_prio = 0;
  13398. +
  13399. + mptcp_init6_subsockets(meta_sk, &loc, &mpcb->remaddr6[0]);
  13400. +#endif
  13401. + }
  13402. + goto next_subflow;
  13403. + }
  13404. +
  13405. +exit:
  13406. + release_sock(meta_sk);
  13407. + mutex_unlock(&mpcb->mpcb_mutex);
  13408. + sock_put(meta_sk);
  13409. +}
  13410. +
  13411. +static void ndiffports_new_session(struct sock *meta_sk, int index)
  13412. +{
  13413. + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
  13414. + struct ndiffports_priv *fmp = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
  13415. +
  13416. + /* Initialize workqueue-struct */
  13417. + INIT_WORK(&fmp->subflow_work, create_subflow_worker);
  13418. + fmp->mpcb = mpcb;
  13419. +}
  13420. +
  13421. +static void ndiffports_create_subflows(struct sock *meta_sk)
  13422. +{
  13423. + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
  13424. + struct ndiffports_priv *pm_priv = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
  13425. +
  13426. + if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
  13427. + mpcb->send_infinite_mapping ||
  13428. + mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
  13429. + return;
  13430. +
  13431. + if (!work_pending(&pm_priv->subflow_work)) {
  13432. + sock_hold(meta_sk);
  13433. + queue_work(mptcp_wq, &pm_priv->subflow_work);
  13434. + }
  13435. +}
  13436. +
  13437. +static int ndiffports_get_local_index(sa_family_t family, union inet_addr *addr,
  13438. + struct net *net)
  13439. +{
  13440. + return 0;
  13441. +}
  13442. +
  13443. +static struct mptcp_pm_ops ndiffports __read_mostly = {
  13444. + .new_session = ndiffports_new_session,
  13445. + .fully_established = ndiffports_create_subflows,
  13446. + .get_local_index = ndiffports_get_local_index,
  13447. + .get_local_id = ndiffports_get_local_index,
  13448. + .name = "ndiffports",
  13449. + .owner = THIS_MODULE,
  13450. +};
  13451. +
  13452. +static struct ctl_table ndiff_table[] = {
  13453. + {
  13454. + .procname = "mptcp_ndiffports",
  13455. + .data = &sysctl_mptcp_ndiffports,
  13456. + .maxlen = sizeof(int),
  13457. + .mode = 0644,
  13458. + .proc_handler = &proc_dointvec
  13459. + },
  13460. + { }
  13461. +};
  13462. +
  13463. +static struct ctl_table_header *mptcp_sysctl; /* handle for ndiff_table */
  13464. +
  13465. +/* General initialization of MPTCP_PM */
  13466. +static int __init ndiffports_register(void)
  13467. +{
  13468. + BUILD_BUG_ON(sizeof(struct ndiffports_priv) > MPTCP_PM_SIZE);
  13469. +
  13470. + mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", ndiff_table);
  13471. + if (!mptcp_sysctl)
  13472. + goto exit;
  13473. +
  13474. + if (mptcp_register_path_manager(&ndiffports))
  13475. + goto pm_failed;
  13476. +
  13477. + return 0;
  13478. +
  13479. +pm_failed:
  13480. + unregister_net_sysctl_table(mptcp_sysctl);
  13481. +exit:
  13482. + return -1;
  13483. +}
  13484. +
  13485. +static void ndiffports_unregister(void)
  13486. +{
  13487. + mptcp_unregister_path_manager(&ndiffports);
  13488. + unregister_net_sysctl_table(mptcp_sysctl);
  13489. +}
  13490. +
  13491. +module_init(ndiffports_register);
  13492. +module_exit(ndiffports_unregister);
  13493. +
  13494. +MODULE_AUTHOR("Christoph Paasch");
  13495. +MODULE_LICENSE("GPL");
  13496. +MODULE_DESCRIPTION("NDIFF-PORTS MPTCP");
  13497. +MODULE_VERSION("0.88");
  13498. diff --git a/net/mptcp/mptcp_ofo_queue.c b/net/mptcp/mptcp_ofo_queue.c
  13499. new file mode 100644
  13500. index 0000000..e182855
  13501. --- /dev/null
  13502. +++ b/net/mptcp/mptcp_ofo_queue.c
  13503. @@ -0,0 +1,278 @@
  13504. +/*
  13505. + * MPTCP implementation - Fast algorithm for MPTCP meta-reordering
  13506. + *
  13507. + * Initial Design & Implementation:
  13508. + * Sébastien Barré <sebastien.barre@uclouvain.be>
  13509. + *
  13510. + * Current Maintainer & Author:
  13511. + * Christoph Paasch <christoph.paasch@uclouvain.be>
  13512. + *
  13513. + * Additional authors:
  13514. + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
  13515. + * Gregory Detal <gregory.detal@uclouvain.be>
  13516. + * Fabien Duchêne <fabien.duchene@uclouvain.be>
  13517. + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
  13518. + * Lavkesh Lahngir <lavkesh51@gmail.com>
  13519. + * Andreas Ripke <ripke@neclab.eu>
  13520. + * Vlad Dogaru <vlad.dogaru@intel.com>
  13521. + * Octavian Purdila <octavian.purdila@intel.com>
  13522. + * John Ronan <jronan@tssg.org>
  13523. + * Catalin Nicutar <catalin.nicutar@gmail.com>
  13524. + * Brandon Heller <brandonh@stanford.edu>
  13525. + *
  13526. + * This program is free software; you can redistribute it and/or
  13527. + * modify it under the terms of the GNU General Public License
  13528. + * as published by the Free Software Foundation; either version
  13529. + * 2 of the License, or (at your option) any later version.
  13530. + */
  13531. +
  13532. +#include <linux/skbuff.h>
  13533. +#include <linux/slab.h>
  13534. +#include <net/tcp.h>
  13535. +#include <net/mptcp.h>
  13536. +
  13537. +void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
  13538. + const struct sk_buff *skb)
  13539. +{
  13540. + struct tcp_sock *tp;
  13541. +
  13542. + mptcp_for_each_tp(mpcb, tp) {
  13543. + if (tp->mptcp->shortcut_ofoqueue == skb) {
  13544. + tp->mptcp->shortcut_ofoqueue = NULL;
  13545. + return;
  13546. + }
  13547. + }
  13548. +}
  13549. +
  13550. +/* Does 'skb' fit after 'here' in the queue 'head'?
  13551. + * Returns 1 if it was queued, -1 if it was coalesced into 'here', else 0.
  13552. + */
  13553. +static int mptcp_ofo_queue_after(struct sk_buff_head *head,
  13554. + struct sk_buff *skb, struct sk_buff *here,
  13555. + struct tcp_sock *tp)
  13556. +{
  13557. + struct sock *meta_sk = tp->meta_sk;
  13558. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  13559. + u32 seq = TCP_SKB_CB(skb)->seq;
  13560. + u32 end_seq = TCP_SKB_CB(skb)->end_seq;
  13561. +
  13562. + /* We want to queue skb after here, thus seq >= end_seq */
  13563. + if (before(seq, TCP_SKB_CB(here)->end_seq))
  13564. + return 0;
  13565. +
  13566. + if (seq == TCP_SKB_CB(here)->end_seq) {
  13567. + bool fragstolen = false;
  13568. +
  13569. + if (!tcp_try_coalesce(meta_sk, here, skb, &fragstolen)) {
  13570. + __skb_queue_after(&meta_tp->out_of_order_queue, here, skb);
  13571. + return 1;
  13572. + } else {
  13573. + kfree_skb_partial(skb, fragstolen);
  13574. + return -1;
  13575. + }
  13576. + }
  13577. +
  13578. + /* If here is the last one, we can always queue it */
  13579. + if (skb_queue_is_last(head, here)) {
  13580. + __skb_queue_after(head, here, skb);
  13581. + return 1;
  13582. + } else {
  13583. + struct sk_buff *skb1 = skb_queue_next(head, here);
  13584. + /* It's not the last one, but does it fit between 'here' and
  13585. + * the one after 'here'? Thus, is end_seq <= after_here->seq?
  13586. + */
  13587. + if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) {
  13588. + __skb_queue_after(head, here, skb);
  13589. + return 1;
  13590. + }
  13591. + }
  13592. +
  13593. + return 0;
  13594. +}
  13595. +
  13596. +static void try_shortcut(struct sk_buff *shortcut, struct sk_buff *skb,
  13597. + struct sk_buff_head *head, struct tcp_sock *tp)
  13598. +{
  13599. + struct sock *meta_sk = tp->meta_sk;
  13600. + struct tcp_sock *tp_it, *meta_tp = tcp_sk(meta_sk);
  13601. + struct mptcp_cb *mpcb = meta_tp->mpcb;
  13602. + struct sk_buff *skb1, *best_shortcut = NULL;
  13603. + u32 seq = TCP_SKB_CB(skb)->seq;
  13604. + u32 end_seq = TCP_SKB_CB(skb)->end_seq;
  13605. + u32 distance = 0xffffffff;
  13606. +
  13607. + /* First, check the tp's shortcut */
  13608. + if (!shortcut) {
  13609. + if (skb_queue_empty(head)) {
  13610. + __skb_queue_head(head, skb);
  13611. + goto end;
  13612. + }
  13613. + } else {
  13614. + int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
  13615. + /* Is the tp's shortcut a hit? If yes, we insert. */
  13616. +
  13617. + if (ret) {
  13618. + skb = (ret > 0) ? skb : NULL;
  13619. + goto end;
  13620. + }
  13621. + }
  13622. +
  13623. + /* Check the shortcuts of the other subsockets. */
  13624. + mptcp_for_each_tp(mpcb, tp_it) {
  13625. + shortcut = tp_it->mptcp->shortcut_ofoqueue;
  13626. + /* Can we queue it here? If yes, do so! */
  13627. + if (shortcut) {
  13628. + int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
  13629. +
  13630. + if (ret) {
  13631. + skb = (ret > 0) ? skb : NULL;
  13632. + goto end;
  13633. + }
  13634. + }
  13635. +
  13636. + /* Could not queue it, check if we are close.
  13637. + * We are looking for a shortcut, close enough to seq to
  13638. + * set skb1 prematurely and thus improve the subsequent lookup,
  13639. + * which tries to find a skb1 so that skb1->seq <= seq.
  13640. + *
  13641. + * So, here we only take shortcuts, whose shortcut->seq > seq,
  13642. + * and minimize the distance between shortcut->seq and seq and
  13643. + * set best_shortcut to this one with the minimal distance.
  13644. + *
  13645. + * That way, the subsequent while-loop is shortest.
  13646. + */
  13647. + if (shortcut && after(TCP_SKB_CB(shortcut)->seq, seq)) {
  13648. + /* Are we closer than the current best shortcut? */
  13649. + if ((u32)(TCP_SKB_CB(shortcut)->seq - seq) < distance) {
  13650. + distance = (u32)(TCP_SKB_CB(shortcut)->seq - seq);
  13651. + best_shortcut = shortcut;
  13652. + }
  13653. + }
  13654. + }
  13655. +
  13656. + if (best_shortcut)
  13657. + skb1 = best_shortcut;
  13658. + else
  13659. + skb1 = skb_peek_tail(head);
  13660. +
  13661. + if (seq == TCP_SKB_CB(skb1)->end_seq) {
  13662. + bool fragstolen = false;
  13663. +
  13664. + if (!tcp_try_coalesce(meta_sk, skb1, skb, &fragstolen)) {
  13665. + __skb_queue_after(&meta_tp->out_of_order_queue, skb1, skb);
  13666. + } else {
  13667. + kfree_skb_partial(skb, fragstolen);
  13668. + skb = NULL;
  13669. + }
  13670. +
  13671. + goto end;
  13672. + }
  13673. +
  13674. + /* Find the insertion point, starting from best_shortcut if available.
  13675. + *
  13676. + * Inspired from tcp_data_queue_ofo.
  13677. + */
  13678. + while (1) {
  13679. + /* skb1->seq <= seq */
  13680. + if (!after(TCP_SKB_CB(skb1)->seq, seq))
  13681. + break;
  13682. + if (skb_queue_is_first(head, skb1)) {
  13683. + skb1 = NULL;
  13684. + break;
  13685. + }
  13686. + skb1 = skb_queue_prev(head, skb1);
  13687. + }
  13688. +
  13689. + /* Does skb overlap with the previous one? */
  13690. + if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
  13691. + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
  13692. + /* All the bits are present. */
  13693. + __kfree_skb(skb);
  13694. + skb = NULL;
  13695. + goto end;
  13696. + }
  13697. + if (seq == TCP_SKB_CB(skb1)->seq) {
  13698. + if (skb_queue_is_first(head, skb1))
  13699. + skb1 = NULL;
  13700. + else
  13701. + skb1 = skb_queue_prev(head, skb1);
  13702. + }
  13703. + }
  13704. + if (!skb1)
  13705. + __skb_queue_head(head, skb);
  13706. + else
  13707. + __skb_queue_after(head, skb1, skb);
  13708. +
  13709. + /* And clean segments covered by new one as whole. */
  13710. + while (!skb_queue_is_last(head, skb)) {
  13711. + skb1 = skb_queue_next(head, skb);
  13712. +
  13713. + if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
  13714. + break;
  13715. +
  13716. + __skb_unlink(skb1, head);
  13717. + mptcp_remove_shortcuts(mpcb, skb1);
  13718. + __kfree_skb(skb1);
  13719. + }
  13720. +
  13721. +end:
  13722. + if (skb) {
  13723. + skb_set_owner_r(skb, meta_sk);
  13724. + tp->mptcp->shortcut_ofoqueue = skb;
  13725. + }
  13726. +
  13727. + return;
  13728. +}
  13729. +
  13730. +/**
  13731. + * @sk: the subflow that received this skb.
  13732. + */
  13733. +void mptcp_add_meta_ofo_queue(struct sock *meta_sk, struct sk_buff *skb,
  13734. + struct sock *sk)
  13735. +{
  13736. + struct tcp_sock *tp = tcp_sk(sk);
  13737. +
  13738. + try_shortcut(tp->mptcp->shortcut_ofoqueue, skb,
  13739. + &tcp_sk(meta_sk)->out_of_order_queue, tp);
  13740. +}
  13741. +
  13742. +void mptcp_ofo_queue(struct sock *meta_sk)
  13743. +{
  13744. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  13745. + struct sk_buff *skb;
  13746. +
  13747. + while ((skb = skb_peek(&meta_tp->out_of_order_queue)) != NULL) {
  13748. + u32 old_rcv_nxt = meta_tp->rcv_nxt;
  13749. + if (after(TCP_SKB_CB(skb)->seq, meta_tp->rcv_nxt))
  13750. + break;
  13751. +
  13752. + if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->rcv_nxt)) {
  13753. + __skb_unlink(skb, &meta_tp->out_of_order_queue);
  13754. + mptcp_remove_shortcuts(meta_tp->mpcb, skb);
  13755. + __kfree_skb(skb);
  13756. + continue;
  13757. + }
  13758. +
  13759. + __skb_unlink(skb, &meta_tp->out_of_order_queue);
  13760. + mptcp_remove_shortcuts(meta_tp->mpcb, skb);
  13761. +
  13762. + __skb_queue_tail(&meta_sk->sk_receive_queue, skb);
  13763. + meta_tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
  13764. + mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
  13765. +
  13766. + if (tcp_hdr(skb)->fin)
  13767. + mptcp_fin(meta_sk);
  13768. + }
  13769. +}
  13770. +
  13771. +void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp)
  13772. +{
  13773. + struct sk_buff_head *head = &meta_tp->out_of_order_queue;
  13774. + struct sk_buff *skb, *tmp;
  13775. +
  13776. + skb_queue_walk_safe(head, skb, tmp) {
  13777. + __skb_unlink(skb, head);
  13778. + mptcp_remove_shortcuts(meta_tp->mpcb, skb);
  13779. + kfree_skb(skb);
  13780. + }
  13781. +}
  13782. diff --git a/net/mptcp/mptcp_olia.c b/net/mptcp/mptcp_olia.c
  13783. new file mode 100644
  13784. index 0000000..43d821e
  13785. --- /dev/null
  13786. +++ b/net/mptcp/mptcp_olia.c
  13787. @@ -0,0 +1,314 @@
  13788. +/*
  13789. + * MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL:
  13790. + *
  13791. + * Algorithm design:
  13792. + * Ramin Khalili <ramin.khalili@epfl.ch>
  13793. + * Nicolas Gast <nicolas.gast@epfl.ch>
  13794. + * Jean-Yves Le Boudec <jean-yves.leboudec@epfl.ch>
  13795. + *
  13796. + * Implementation:
  13797. + * Ramin Khalili <ramin.khalili@epfl.ch>
  13798. + *
  13799. + * Ported to the official MPTCP-kernel:
  13800. + * Christoph Paasch <christoph.paasch@uclouvain.be>
  13801. + *
  13802. + * This program is free software; you can redistribute it and/or
  13803. + * modify it under the terms of the GNU General Public License
  13804. + * as published by the Free Software Foundation; either version
  13805. + * 2 of the License, or (at your option) any later version.
  13806. + */
  13807. +
  13808. +
  13809. +#include <net/tcp.h>
  13810. +#include <net/mptcp.h>
  13811. +
  13812. +#include <linux/module.h>
  13813. +
  13814. +static int scale = 10;
  13815. +
  13816. +struct mptcp_olia {
  13817. + u32 mptcp_loss1;
  13818. + u32 mptcp_loss2;
  13819. + u32 mptcp_loss3;
  13820. + int epsilon_num;
  13821. + u32 epsilon_den;
  13822. + int mptcp_snd_cwnd_cnt;
  13823. +};
  13824. +
  13825. +static inline int mptcp_olia_sk_can_send(const struct sock *sk)
  13826. +{
  13827. + return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt;
  13828. +}
  13829. +
  13830. +static inline u64 mptcp_olia_scale(u64 val, int scale)
  13831. +{
  13832. + return (u64) val << scale;
  13833. +}
  13834. +
  13835. +/* Account for the artificial inflation of cwnd (see RFC5681)
  13836. + * during the fast-retransmit phase.
  13837. + */
  13838. +static u32 mptcp_get_crt_cwnd(struct sock *sk)
  13839. +{
  13840. + struct inet_connection_sock *icsk = inet_csk(sk);
  13841. +
  13842. + if (icsk->icsk_ca_state == TCP_CA_Recovery)
  13843. + return tcp_sk(sk)->snd_ssthresh;
  13844. + else
  13845. + return tcp_sk(sk)->snd_cwnd;
  13846. +}
  13847. +
  13848. +/* return the denominator of the first term of the increasing term */
  13849. +static u64 mptcp_get_rate(struct mptcp_cb *mpcb , u32 path_rtt)
  13850. +{
  13851. + struct sock *sk;
  13852. + u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */
  13853. +
  13854. + mptcp_for_each_sk(mpcb, sk) {
  13855. + struct tcp_sock *tp = tcp_sk(sk);
  13856. + u64 scaled_num;
  13857. + u32 tmp_cwnd;
  13858. +
  13859. + if (!mptcp_olia_sk_can_send(sk))
  13860. + continue;
  13861. +
  13862. + tmp_cwnd = mptcp_get_crt_cwnd(sk);
  13863. + scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt;
  13864. + rate += div_u64(scaled_num , tp->srtt);
  13865. + }
  13866. + rate *= rate;
  13867. + return rate;
  13868. +}
  13869. +
  13870. +/* find the maximum cwnd, used to find set M */
  13871. +static u32 mptcp_get_max_cwnd(struct mptcp_cb *mpcb)
  13872. +{
  13873. + struct sock *sk;
  13874. + u32 best_cwnd = 0;
  13875. +
  13876. + mptcp_for_each_sk(mpcb, sk) {
  13877. + u32 tmp_cwnd;
  13878. +
  13879. + if (!mptcp_olia_sk_can_send(sk))
  13880. + continue;
  13881. +
  13882. + tmp_cwnd = mptcp_get_crt_cwnd(sk);
  13883. + if (tmp_cwnd > best_cwnd)
  13884. + best_cwnd = tmp_cwnd;
  13885. + }
  13886. + return best_cwnd;
  13887. +}
  13888. +
  13889. +static void mptcp_get_epsilon(struct mptcp_cb *mpcb)
  13890. +{
  13891. + struct mptcp_olia *ca;
  13892. + struct tcp_sock *tp;
  13893. + struct sock *sk;
  13894. + u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1;
  13895. + u32 max_cwnd = 1, best_cwnd = 1, tmp_cwnd;
  13896. + u8 M = 0, B_not_M = 0;
  13897. +
  13898. + /* TODO - integrate this in the following loop - we just want to iterate once */
  13899. + /* Largest window over all sendable subflows; paths matching it form set M */
  13900. + max_cwnd = mptcp_get_max_cwnd(mpcb);
  13901. +
  13902. + /* find the best path: largest inter-loss distance over rtt^2 */
  13903. + mptcp_for_each_sk(mpcb, sk) {
  13904. + tp = tcp_sk(sk);
  13905. + ca = inet_csk_ca(sk);
  13906. +
  13907. + if (!mptcp_olia_sk_can_send(sk))
  13908. + continue;
  13909. + /* widen before squaring: a 32-bit srtt * srtt product would truncate */
  13910. + tmp_rtt = (u64)tp->srtt * tp->srtt;
  13911. + /* TODO - check here and rename variables */
  13912. + tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
  13913. + ca->mptcp_loss2 - ca->mptcp_loss1);
  13914. +
  13915. + tmp_cwnd = mptcp_get_crt_cwnd(sk);
  13916. + if (tmp_int * best_rtt >= best_int * tmp_rtt) {
  13917. + best_rtt = tmp_rtt;
  13918. + best_int = tmp_int;
  13919. + best_cwnd = tmp_cwnd;
  13920. + }
  13921. + }
  13922. +
  13923. + /* TODO - integrate this here in mptcp_get_max_cwnd and in the previous loop */
  13924. + /* find the size of M and B_not_M */
  13925. + mptcp_for_each_sk(mpcb, sk) {
  13926. + tp = tcp_sk(sk);
  13927. + ca = inet_csk_ca(sk);
  13928. +
  13929. + if (!mptcp_olia_sk_can_send(sk))
  13930. + continue;
  13931. +
  13932. + tmp_cwnd = mptcp_get_crt_cwnd(sk);
  13933. + if (tmp_cwnd == max_cwnd) {
  13934. + M++;
  13935. + } else {
  13936. + tmp_rtt = (u64)tp->srtt * tp->srtt;
  13937. + tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
  13938. + ca->mptcp_loss2 - ca->mptcp_loss1);
  13939. +
  13940. + if (tmp_int * best_rtt == best_int * tmp_rtt)
  13941. + B_not_M++;
  13942. + }
  13943. + }
  13944. +
  13945. + /* check if the path is in M or B_not_M and set the value of epsilon accordingly */
  13946. + mptcp_for_each_sk(mpcb, sk) {
  13947. + tp = tcp_sk(sk);
  13948. + ca = inet_csk_ca(sk);
  13949. +
  13950. + if (!mptcp_olia_sk_can_send(sk))
  13951. + continue;
  13952. +
  13953. + if (B_not_M == 0) {
  13954. + ca->epsilon_num = 0;
  13955. + ca->epsilon_den = 1;
  13956. + } else {
  13957. + tmp_rtt = (u64)tp->srtt * tp->srtt;
  13958. + tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
  13959. + ca->mptcp_loss2 - ca->mptcp_loss1);
  13960. + tmp_cwnd = mptcp_get_crt_cwnd(sk);
  13961. +
  13962. + if (tmp_cwnd < max_cwnd &&
  13963. + tmp_int * best_rtt == best_int * tmp_rtt){
  13964. + ca->epsilon_num = 1;
  13965. + ca->epsilon_den = mpcb->cnt_established * B_not_M;
  13966. + } else if (tmp_cwnd == max_cwnd) {
  13967. + ca->epsilon_num = -1;
  13968. + ca->epsilon_den = mpcb->cnt_established * M;
  13969. + } else {
  13970. + ca->epsilon_num = 0;
  13971. + ca->epsilon_den = 1;
  13972. + }
  13973. + }
  13974. + }
  13975. +
  13976. +}
  13977. +
  13978. +/* setting the initial values */
  13979. +static void mptcp_olia_init(struct sock *sk)
  13980. +{
  13981. + struct tcp_sock *tp = tcp_sk(sk);
  13982. + struct mptcp_olia *ca = inet_csk_ca(sk);
  13983. +
  13984. + if (tp->mpc) {
  13985. + ca->mptcp_loss1 = tp->snd_una;
  13986. + ca->mptcp_loss2 = tp->snd_una;
  13987. + ca->mptcp_loss3 = tp->snd_una;
  13988. + ca->mptcp_snd_cwnd_cnt = 0;
  13989. + ca->epsilon_num = 0;
  13990. + ca->epsilon_den = 1;
  13991. + }
  13992. +}
  13993. +
  13994. +/* updating inter-loss distance and ssthresh */
  13995. +static void mptcp_olia_set_state(struct sock *sk, u8 new_state)
  13996. +{
  13997. + if (!tcp_sk(sk)->mpc)
  13998. + return;
  13999. +
  14000. + if (new_state == TCP_CA_Loss ||
  14001. + new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) {
  14002. + struct mptcp_olia *ca = inet_csk_ca(sk);
  14003. +
  14004. + if (ca->mptcp_loss3 != ca->mptcp_loss2 &&
  14005. + !inet_csk(sk)->icsk_retransmits) {
  14006. + ca->mptcp_loss1 = ca->mptcp_loss2;
  14007. + ca->mptcp_loss2 = ca->mptcp_loss3;
  14008. + }
  14009. + }
  14010. +
  14011. +}
  14012. +
  14013. +/* main algorithm */
  14014. +static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
  14015. +{
  14016. + struct tcp_sock *tp = tcp_sk(sk);
  14017. + struct mptcp_olia *ca = inet_csk_ca(sk);
  14018. + struct mptcp_cb *mpcb = tp->mpcb;
  14019. +
  14020. + u64 inc_num, inc_den, rate, cwnd_scaled;
  14021. +
  14022. + if (!tp->mpc) {
  14023. + tcp_reno_cong_avoid(sk, ack, acked, in_flight);
  14024. + return;
  14025. + }
  14026. +
  14027. + ca->mptcp_loss3 = tp->snd_una;
  14028. +
  14029. + if (!tcp_is_cwnd_limited(sk, in_flight))
  14030. + return;
  14031. +
  14032. + /* slow start if it is in the safe area */
  14033. + if (tp->snd_cwnd <= tp->snd_ssthresh) {
  14034. + tcp_slow_start(tp, acked);
  14035. + return;
  14036. + }
  14037. +
  14038. + mptcp_get_epsilon(mpcb);
  14039. + rate = mptcp_get_rate(mpcb, tp->srtt);
  14040. + cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale);
  14041. + inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? : 1;
  14042. +
  14043. + /* calculate the increasing term, scaling is used to reduce the rounding effect */
  14044. + if (ca->epsilon_num == -1) {
  14045. + if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) {
  14046. + inc_num = rate - ca->epsilon_den *
  14047. + cwnd_scaled * cwnd_scaled;
  14048. + ca->mptcp_snd_cwnd_cnt -= div64_u64(
  14049. + mptcp_olia_scale(inc_num , scale) , inc_den);
  14050. + } else {
  14051. + inc_num = ca->epsilon_den *
  14052. + cwnd_scaled * cwnd_scaled - rate;
  14053. + ca->mptcp_snd_cwnd_cnt += div64_u64(
  14054. + mptcp_olia_scale(inc_num , scale) , inc_den);
  14055. + }
  14056. + } else {
  14057. + inc_num = ca->epsilon_num * rate +
  14058. + ca->epsilon_den * cwnd_scaled * cwnd_scaled;
  14059. + ca->mptcp_snd_cwnd_cnt += div64_u64(
  14060. + mptcp_olia_scale(inc_num , scale) , inc_den);
  14061. + }
  14062. +
  14063. +
  14064. + if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) {
  14065. + if (tp->snd_cwnd < tp->snd_cwnd_clamp)
  14066. + tp->snd_cwnd++;
  14067. + ca->mptcp_snd_cwnd_cnt = 0;
  14068. + } else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) {
  14069. + tp->snd_cwnd = max((int) 1 , (int) tp->snd_cwnd - 1);
  14070. + ca->mptcp_snd_cwnd_cnt = 0;
  14071. + }
  14072. +}
  14073. +
  14074. +static struct tcp_congestion_ops mptcp_olia = {
  14075. + .init = mptcp_olia_init,
  14076. + .ssthresh = tcp_reno_ssthresh,
  14077. + .cong_avoid = mptcp_olia_cong_avoid,
  14078. + .set_state = mptcp_olia_set_state,
  14079. + .min_cwnd = tcp_reno_min_cwnd,
  14080. + .owner = THIS_MODULE,
  14081. + .name = "olia",
  14082. +};
  14083. +
  14084. +static int __init mptcp_olia_register(void)
  14085. +{
  14086. + BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE);
  14087. + return tcp_register_congestion_control(&mptcp_olia);
  14088. +}
  14089. +
  14090. +static void __exit mptcp_olia_unregister(void)
  14091. +{
  14092. + tcp_unregister_congestion_control(&mptcp_olia);
  14093. +}
  14094. +
  14095. +module_init(mptcp_olia_register);
  14096. +module_exit(mptcp_olia_unregister);
  14097. +
  14098. +MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec");
  14099. +MODULE_LICENSE("GPL");
  14100. +MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL");
  14101. +MODULE_VERSION("0.1");
  14102. diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c
  14103. new file mode 100644
  14104. index 0000000..807b79e
  14105. --- /dev/null
  14106. +++ b/net/mptcp/mptcp_output.c
  14107. @@ -0,0 +1,2255 @@
  14108. +/*
  14109. + * MPTCP implementation - Sending side
  14110. + *
  14111. + * Initial Design & Implementation:
  14112. + * Sébastien Barré <sebastien.barre@uclouvain.be>
  14113. + *
  14114. + * Current Maintainer & Author:
  14115. + * Christoph Paasch <christoph.paasch@uclouvain.be>
  14116. + *
  14117. + * Additional authors:
  14118. + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
  14119. + * Gregory Detal <gregory.detal@uclouvain.be>
  14120. + * Fabien Duchêne <fabien.duchene@uclouvain.be>
  14121. + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
  14122. + * Lavkesh Lahngir <lavkesh51@gmail.com>
  14123. + * Andreas Ripke <ripke@neclab.eu>
  14124. + * Vlad Dogaru <vlad.dogaru@intel.com>
  14125. + * Octavian Purdila <octavian.purdila@intel.com>
  14126. + * John Ronan <jronan@tssg.org>
  14127. + * Catalin Nicutar <catalin.nicutar@gmail.com>
  14128. + * Brandon Heller <brandonh@stanford.edu>
  14129. + *
  14130. + *
  14131. + * This program is free software; you can redistribute it and/or
  14132. + * modify it under the terms of the GNU General Public License
  14133. + * as published by the Free Software Foundation; either version
  14134. + * 2 of the License, or (at your option) any later version.
  14135. + */
  14136. +
  14137. +#include <linux/kconfig.h>
  14138. +#include <linux/skbuff.h>
  14139. +#include <linux/tcp.h>
  14140. +
  14141. +#include <net/mptcp.h>
  14142. +#include <net/mptcp_v4.h>
  14143. +#include <net/mptcp_v6.h>
  14144. +#include <net/sock.h>
  14145. +
  14146. +static inline int mptcp_pi_to_flag(int pi)
  14147. +{
  14148. + return 1 << (pi - 1);
  14149. +}
  14150. +
  14151. +static inline int mptcp_sub_len_remove_addr(u16 bitfield)
  14152. +{
  14153. + unsigned int c;
  14154. + for (c = 0; bitfield; c++)
  14155. + bitfield &= bitfield - 1;
  14156. + return MPTCP_SUB_LEN_REMOVE_ADDR + c - 1;
  14157. +}
  14158. +
  14159. +int mptcp_sub_len_remove_addr_align(u16 bitfield)
  14160. +{
  14161. + return ALIGN(mptcp_sub_len_remove_addr(bitfield), 4);
  14162. +}
  14163. +EXPORT_SYMBOL(mptcp_sub_len_remove_addr_align);
  14164. +
  14165. +/* Is the sub-socket sk available to send the skb? */
  14166. +static int mptcp_is_available(struct sock *sk, struct sk_buff *skb,
  14167. + unsigned int *mss)
  14168. +{
  14169. + struct tcp_sock *tp = tcp_sk(sk);
  14170. + unsigned int mss_now;
  14171. +
  14172. + /* Set of states for which we are allowed to send data */
  14173. + if (!mptcp_sk_can_send(sk))
  14174. + return 0;
  14175. +
  14176. + /* We do not send data on this subflow unless it is
  14177. + * fully established, i.e. the 4th ack has been received.
  14178. + */
  14179. + if (tp->mptcp->pre_established)
  14180. + return 0;
  14181. +
  14182. + if (tp->pf ||
  14183. + (tp->mpcb->noneligible & mptcp_pi_to_flag(tp->mptcp->path_index)))
  14184. + return 0;
  14185. +
  14186. + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
  14187. + /* If SACK is disabled, and we got a loss, TCP does not exit
  14188. + * the loss-state until something above high_seq has been acked.
  14189. + * (see tcp_try_undo_recovery)
  14190. + *
  14191. + * high_seq is the snd_nxt at the moment of the RTO. As soon
  14192. + * as we have an RTO, we won't push data on the subflow.
  14193. + * Thus, snd_una can never go beyond high_seq.
  14194. + */
  14195. + if (!tcp_is_reno(tp))
  14196. + return 0;
  14197. + else if (tp->snd_una != tp->high_seq)
  14198. + return 0;
  14199. + }
  14200. +
  14201. + if (!tp->mptcp->fully_established) {
  14202. + /* Make sure that we send in-order data */
  14203. + if (skb && tp->mptcp->second_packet &&
  14204. + tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
  14205. + return 0;
  14206. + }
  14207. +
  14208. + if (!tcp_cwnd_test(tp, skb))
  14209. + return 0;
  14210. +
  14211. + mss_now = tcp_current_mss(sk);
  14212. + /* Don't send on this subflow if we bypass the allowed send-window at
  14213. + * the per-subflow level. Similar to tcp_snd_wnd_test, but manually
  14214. + * calculated end_seq (because here at this point end_seq is still at
  14215. + * the meta-level).
  14216. + */
  14217. + if (skb && after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp)))
  14218. + return 0;
  14219. +
  14220. + if (mss)
  14221. + *mss = mss_now;
  14222. +
  14223. + return 1;
  14224. +}
  14225. +
  14226. +/* Are we not allowed to reinject this skb on tp? Returns non-zero if skb is non-NULL and the flag of tp's path_index is already set in the skb's path_mask, 0 otherwise. */
  14227. +static int mptcp_dont_reinject_skb(struct tcp_sock *tp, struct sk_buff *skb)
  14228. +{
  14229. + /* If the skb has already been enqueued in this sk, try to find
  14230. + * another one.
  14231. + */
  14232. + return skb &&
  14233. + /* Has the skb already been enqueued into this subsocket? */
  14234. + mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
  14235. +}
  14236. +
  14237. +/* This is the scheduler. This function decides on which flow to send
  14238. + * a given MSS. If all subflows are found to be busy, NULL is returned.
  14239. + * The flow is selected based on the shortest RTT (smallest tp->srtt).
  14240. + * If all paths have full cong windows, we simply return NULL.
  14241. + *
  14242. + * Additionally, this function is aware of the backup-subflows: a low-priority subflow is preferred only when cnt_backups equals mpcb->cnt_established. *mss_now (if non-NULL) receives the MSS of the chosen subflow.
  14243. + */
  14244. +static struct sock *get_available_subflow(struct sock *meta_sk,
  14245. + struct sk_buff *skb,
  14246. + unsigned int *mss_now)
  14247. +{
  14248. + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
  14249. + struct sock *sk, *bestsk = NULL, *lowpriosk = NULL, *backupsk = NULL;
  14250. + unsigned int mss = 0, mss_lowprio = 0, mss_backup = 0;
  14251. + u32 min_time_to_peer = 0xffffffff, lowprio_min_time_to_peer = 0xffffffff;
  14252. + int cnt_backups = 0;
  14253. +
  14254. + /* if there is only one subflow, bypass the scheduling function */
  14255. + if (mpcb->cnt_subflows == 1) {
  14256. + bestsk = (struct sock *)mpcb->connection_list;
  14257. + if (!mptcp_is_available(bestsk, skb, mss_now))
  14258. + bestsk = NULL;
  14259. + return bestsk;
  14260. + }
  14261. +
  14262. + /* Answer data_fin on same subflow!!! */
  14263. + if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
  14264. + skb && mptcp_is_data_fin(skb)) {
  14265. + mptcp_for_each_sk(mpcb, sk) {
  14266. + if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
  14267. + mptcp_is_available(sk, skb, mss_now))
  14268. + return sk;
  14269. + }
  14270. + }
  14271. +
  14272. + /* First, find the best subflow */
  14273. + mptcp_for_each_sk(mpcb, sk) {
  14274. + struct tcp_sock *tp = tcp_sk(sk);
  14275. + int this_mss; /* NOTE(review): passed as &this_mss where mptcp_is_available() takes unsigned int * - signedness mismatch, confirm intended */
  14276. +
  14277. + if (tp->mptcp->rcv_low_prio || tp->mptcp->low_prio)
  14278. + cnt_backups++;
  14279. +
  14280. + if ((tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) &&
  14281. + tp->srtt < lowprio_min_time_to_peer) {
  14282. +
  14283. + if (!mptcp_is_available(sk, skb, &this_mss))
  14284. + continue;
  14285. +
  14286. + if (mptcp_dont_reinject_skb(tp, skb)) { /* skb already went out on this subflow: remember it only as a last-resort fallback */
  14287. + mss_backup = this_mss;
  14288. + backupsk = sk;
  14289. + continue;
  14290. + }
  14291. +
  14292. + lowprio_min_time_to_peer = tp->srtt;
  14293. + lowpriosk = sk;
  14294. + mss_lowprio = this_mss;
  14295. + } else if (!(tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) &&
  14296. + tp->srtt < min_time_to_peer) {
  14297. + if (!mptcp_is_available(sk, skb, &this_mss))
  14298. + continue;
  14299. +
  14300. + if (mptcp_dont_reinject_skb(tp, skb)) { /* same fallback handling as in the low-priority branch */
  14301. + mss_backup = this_mss;
  14302. + backupsk = sk;
  14303. + continue;
  14304. + }
  14305. +
  14306. + min_time_to_peer = tp->srtt;
  14307. + bestsk = sk;
  14308. + mss = this_mss;
  14309. + }
  14310. + }
  14311. +
  14312. + if (mpcb->cnt_established == cnt_backups && lowpriosk) {
  14313. + mss = mss_lowprio;
  14314. + sk = lowpriosk;
  14315. + } else if (bestsk) {
  14316. + sk = bestsk;
  14317. + } else if (backupsk){
  14318. + /* It has been sent on all subflows once - let's give it a
  14319. + * chance again by restarting its pathmask.
  14320. + */
  14321. + if (skb)
  14322. + TCP_SKB_CB(skb)->path_mask = 0;
  14323. + mss = mss_backup;
  14324. + sk = backupsk;
  14325. + }
  14326. +
  14327. + if (mss_now)
  14328. + *mss_now = mss;
  14329. +
  14330. + return sk; /* NOTE(review): if none of the branches above matched, sk is whatever mptcp_for_each_sk left it as - presumably NULL; confirm against the macro */
  14331. +}
  14332. +
  14333. +static struct mp_dss *mptcp_skb_find_dss(const struct sk_buff *skb)
  14334. +{ /* Return the DSS option located directly in front of skb->data (the offset at which mptcp_skb_entail() writes it), or NULL if mptcp_is_data_seq(skb) is false. */
  14335. + if (!mptcp_is_data_seq(skb))
  14336. + return NULL;
  14337. +
  14338. + return (struct mp_dss *)(skb->data - (MPTCP_SUB_LEN_DSS_ALIGN +
  14339. + MPTCP_SUB_LEN_ACK_ALIGN +
  14340. + MPTCP_SUB_LEN_SEQ_ALIGN));
  14341. +}
  14342. +
  14343. +/* Get the data-seq and end-data-seq from the DSS option of orig_skb and
  14344. + * store them again in the tcp_skb_cb of skb. Returns 0 on success, 1 if orig_skb has no DSS option or its M flag is not set.
  14345. + */
  14346. +static int mptcp_reconstruct_mapping(struct sk_buff *skb, struct sk_buff *orig_skb)
  14347. +{
  14348. + struct mp_dss *mpdss = mptcp_skb_find_dss(orig_skb);
  14349. + u32 *p32;
  14350. + u16 *p16;
  14351. +
  14352. + if (!mpdss || !mpdss->M)
  14353. + return 1;
  14354. +
  14355. + /* Move the pointer to the data-seq */
  14356. + p32 = (u32 *)mpdss;
  14357. + p32++; /* skip the first 32-bit word of the option (the mp_dss header itself) */
  14358. + if (mpdss->A) {
  14359. + p32++; /* skip the data-ack word */
  14360. + if (mpdss->a)
  14361. + p32++; /* one more word when the 'a' flag is set - presumably a 64-bit data-ack; confirm */
  14362. + }
  14363. +
  14364. + TCP_SKB_CB(skb)->seq = ntohl(*p32);
  14365. +
  14366. + /* Get the data_len to calculate the end_data_seq */
  14367. + p32++; /* past data_seq */
  14368. + p32++; /* past subseq; the next word holds data_len in its first 16 bits (layout as written by mptcp_skb_entail()) */
  14369. + p16 = (u16 *)p32;
  14370. + TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq;
  14371. +
  14372. + return 0;
  14373. +}
  14374. +
  14375. +/* Similar to __pskb_copy and sk_stream_alloc_skb. Copies the linear part of skb into a new skb allocated with MAX_TCP_HEADER bytes reserved in front of the data, and reuses the page frags (skb_frag_ref) and the frag_list (skb_clone_fraglist) of the original. Returns NULL if the allocation or skb_copy_ubufs() fails. */
  14376. +static struct sk_buff *mptcp_pskb_copy(struct sk_buff *skb)
  14377. +{
  14378. + struct sk_buff *n;
  14379. + /* The TCP header must be at least 32-bit aligned. */
  14380. + int size = ALIGN(skb_headlen(skb), 4);
  14381. +
  14382. + n = alloc_skb_fclone(size + MAX_TCP_HEADER, GFP_ATOMIC);
  14383. + if (!n)
  14384. + return NULL;
  14385. +
  14386. + /* Set the data pointer */
  14387. + skb_reserve(n, MAX_TCP_HEADER);
  14388. + /* Set the tail pointer and length */
  14389. + skb_put(n, skb_headlen(skb));
  14390. + /* Copy the bytes */
  14391. + skb_copy_from_linear_data(skb, n->data, n->len);
  14392. +
  14393. + n->truesize += skb->data_len; /* account for the non-linear data that is attached below */
  14394. + n->data_len = skb->data_len;
  14395. + n->len = skb->len;
  14396. +
  14397. + if (skb_shinfo(skb)->nr_frags) {
  14398. + int i;
  14399. +
  14400. + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
  14401. + if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
  14402. + kfree_skb(n);
  14403. + n = NULL;
  14404. + goto out;
  14405. + }
  14406. + }
  14407. + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  14408. + skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
  14409. + skb_frag_ref(skb, i);
  14410. + }
  14411. + skb_shinfo(n)->nr_frags = i;
  14412. + }
  14413. +
  14414. + if (skb_has_frag_list(skb)) {
  14415. + skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
  14416. + skb_clone_fraglist(n);
  14417. + }
  14418. +
  14419. + copy_skb_header(n, skb);
  14420. +out:
  14421. + return n;
  14422. +}
  14423. +
  14424. +/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are
  14425. + * coming from the meta-retransmit-timer. With clone_it set a copy of orig_skb is queued on mpcb->reinject_queue; otherwise orig_skb is unlinked from sk's write queue and queued itself.
  14426. + */
  14427. +static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk,
  14428. + struct sock *sk, int clone_it)
  14429. +{
  14430. + struct sk_buff *skb, *skb1;
  14431. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  14432. + struct mptcp_cb *mpcb = meta_tp->mpcb;
  14433. + u32 seq, end_seq;
  14434. +
  14435. + if (clone_it) {
  14436. + /* pskb_copy is necessary here, because the TCP/IP-headers
  14437. + * will be changed when it's going to be reinjected on another
  14438. + * subflow.
  14439. + */
  14440. + skb = mptcp_pskb_copy(orig_skb);
  14441. + } else {
  14442. + __skb_unlink(orig_skb, &sk->sk_write_queue); /* NOTE(review): sk is dereferenced here, so sk == NULL callers must pass clone_it != 0 (the two visible ones do) */
  14443. + sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
  14444. + sk->sk_wmem_queued -= orig_skb->truesize;
  14445. + sk_mem_uncharge(sk, orig_skb->truesize);
  14446. + skb = orig_skb;
  14447. + }
  14448. + if (unlikely(!skb))
  14449. + return;
  14450. +
  14451. + if (sk && mptcp_reconstruct_mapping(skb, orig_skb)) { /* coming from a subflow: restore the meta-level seq/end_seq from the DSS option */
  14452. + __kfree_skb(skb);
  14453. + return;
  14454. + }
  14455. +
  14456. + skb->sk = meta_sk;
  14457. +
  14458. + /* If it reached already the destination, we don't have to reinject it */
  14459. + if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
  14460. + __kfree_skb(skb);
  14461. + return;
  14462. + }
  14463. +
  14464. + /* Only reinject segments that are fully covered by the mapping */
  14465. + if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) !=
  14466. + TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
  14467. + u32 seq = TCP_SKB_CB(skb)->seq; /* NOTE(review): these two locals shadow the function-scope seq/end_seq */
  14468. + u32 end_seq = TCP_SKB_CB(skb)->end_seq;
  14469. +
  14470. + __kfree_skb(skb);
  14471. +
  14472. + /* Ok, now we have to look for the full mapping in the meta
  14473. + * send-queue :S
  14474. + */
  14475. + tcp_for_write_queue(skb, meta_sk) {
  14476. + /* Not yet at the mapping? */
  14477. + if (before(TCP_SKB_CB(skb)->seq, seq))
  14478. + continue;
  14479. + /* We have passed by the mapping */
  14480. + if (after(TCP_SKB_CB(skb)->end_seq, end_seq))
  14481. + return;
  14482. +
  14483. + __mptcp_reinject_data(skb, meta_sk, NULL, 1); /* reinject a copy of each meta-level skb that lies inside the mapping */
  14484. + }
  14485. + return;
  14486. + }
  14487. +
  14488. + /* If it's empty, just add */
  14489. + if (skb_queue_empty(&mpcb->reinject_queue)) {
  14490. + skb_queue_head(&mpcb->reinject_queue, skb); /* NOTE(review): uses skb_queue_head() while the insertions below use the __-prefixed variants - confirm this is intended */
  14491. + return;
  14492. + }
  14493. +
  14494. + /* Find place to insert skb - or even we can 'drop' it, as the
  14495. + * data is already covered by other skb's in the reinject-queue.
  14496. + *
  14497. + * This is inspired by code from tcp_data_queue.
  14498. + */
  14499. +
  14500. + skb1 = skb_peek_tail(&mpcb->reinject_queue);
  14501. + seq = TCP_SKB_CB(skb)->seq;
  14502. + while (1) { /* walk backwards from the tail to the last queued skb whose seq is not after ours (skb1 == NULL if there is none) */
  14503. + if (!after(TCP_SKB_CB(skb1)->seq, seq))
  14504. + break;
  14505. + if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) {
  14506. + skb1 = NULL;
  14507. + break;
  14508. + }
  14509. + skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
  14510. + }
  14511. +
  14512. + /* Does skb overlap the previous one? */
  14513. + end_seq = TCP_SKB_CB(skb)->end_seq;
  14514. + if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
  14515. + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
  14516. + /* All the bits are present. Don't reinject */
  14517. + __kfree_skb(skb);
  14518. + return;
  14519. + }
  14520. + if (seq == TCP_SKB_CB(skb1)->seq) { /* same start but we extend further: insert in front of skb1 so the cleanup loop below can drop it */
  14521. + if (skb_queue_is_first(&mpcb->reinject_queue, skb1))
  14522. + skb1 = NULL;
  14523. + else
  14524. + skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
  14525. + }
  14526. + }
  14527. + if (!skb1)
  14528. + __skb_queue_head(&mpcb->reinject_queue, skb);
  14529. + else
  14530. + __skb_queue_after(&mpcb->reinject_queue, skb1, skb);
  14531. +
  14532. + /* And clean segments covered by new one as whole. */
  14533. + while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) {
  14534. + skb1 = skb_queue_next(&mpcb->reinject_queue, skb);
  14535. +
  14536. + if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
  14537. + break;
  14538. +
  14539. + __skb_unlink(skb1, &mpcb->reinject_queue);
  14540. + __kfree_skb(skb1);
  14541. + }
  14542. + return;
  14543. +}
  14544. +
  14545. +/* Inserts data into the reinject queue: every skb on sk's write queue except subflow SYNs/FINs is handed to __mptcp_reinject_data(), pending frames are pushed on the meta socket and sk is marked with tp->pf = 1. Does nothing if the meta socket is in TCP_CLOSE. */
  14546. +void mptcp_reinject_data(struct sock *sk, int clone_it)
  14547. +{
  14548. + struct sk_buff *skb_it, *tmp;
  14549. + struct tcp_sock *tp = tcp_sk(sk);
  14550. + struct sock *meta_sk = tp->meta_sk;
  14551. +
  14552. + /* It has already been closed - there is really no point in reinjecting */
  14553. + if (meta_sk->sk_state == TCP_CLOSE)
  14554. + return;
  14555. +
  14556. + skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) { /* safe walk: with clone_it == 0 the skb is unlinked from this queue by __mptcp_reinject_data() */
  14557. + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it);
  14558. + /* Subflow syn's and fin's are not reinjected.
  14559. + *
  14560. + * As well as empty subflow-fins with a data-fin.
  14561. + * They are reinjected below (without the subflow-fin-flag)
  14562. + */
  14563. + if (tcb->tcp_flags & TCPHDR_SYN ||
  14564. + (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) ||
  14565. + (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len))
  14566. + continue;
  14567. +
  14568. + __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it);
  14569. + }
  14570. +
  14571. + skb_it = tcp_write_queue_tail(meta_sk);
  14572. + /* If sk has sent the empty data-fin, we have to reinject it too. */
  14573. + if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 &&
  14574. + TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index)) {
  14575. + __mptcp_reinject_data(skb_it, meta_sk, NULL, 1);
  14576. + }
  14577. +
  14578. + mptcp_push_pending_frames(meta_sk);
  14579. +
  14580. + tp->pf = 1; /* checked in mptcp_is_available(): a subflow with pf set is not used for sending */
  14581. +}
  14582. +EXPORT_SYMBOL(mptcp_reinject_data);
  14583. +
  14584. +static void mptcp_combine_dfin(struct sk_buff *skb, struct sock *meta_sk,
  14585. + struct sock *subsk)
  14586. +{ /* Decide whether the DATA_FIN carried by skb may be combined with a subflow FIN on subsk; if so, set SEND_SHUTDOWN on subsk and TCPHDR_FIN in the skb's tcp_flags. */
  14587. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  14588. + struct mptcp_cb *mpcb = meta_tp->mpcb;
  14589. + struct sock *sk_it;
  14590. + int all_empty = 1, all_acked;
  14591. +
  14592. + /* In infinite mapping we always try to combine */
  14593. + if (mpcb->infinite_mapping_snd && tcp_close_state(subsk)) {
  14594. + subsk->sk_shutdown |= SEND_SHUTDOWN;
  14595. + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
  14596. + return;
  14597. + }
  14598. +
  14599. + /* Don't combine, if they didn't combine - otherwise we end up in
  14600. + * TIME_WAIT, even if our app is smart enough to avoid it
  14601. + */
  14602. + if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
  14603. + if (!mpcb->dfin_combined)
  14604. + return;
  14605. + }
  14606. +
  14607. + /* If no other subflow has data to send, we can combine */
  14608. + mptcp_for_each_sk(mpcb, sk_it) { /* note: the loop does not skip subsk itself */
  14609. + if (!mptcp_sk_can_send(sk_it))
  14610. + continue;
  14611. +
  14612. + if (!tcp_write_queue_empty(sk_it))
  14613. + all_empty = 0;
  14614. + }
  14615. +
  14616. + /* If all data has been DATA_ACKed, we can combine.
  14617. + * -1, because the data_fin consumed one byte
  14618. + */
  14619. + all_acked = (meta_tp->snd_una == (meta_tp->write_seq - 1));
  14620. +
  14621. + if ((all_empty || all_acked) && tcp_close_state(subsk)) {
  14622. + subsk->sk_shutdown |= SEND_SHUTDOWN;
  14623. + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
  14624. + }
  14625. +}
  14626. +
  14627. +static struct sk_buff *mptcp_skb_entail(struct sock *sk, struct sk_buff *skb,
  14628. + int reinject)
  14629. +{ /* Copy the meta-level skb for subflow sk, write the DSS option in front of the copy's data, renumber the copy in the subflow's sequence space and append it to sk's write queue. Returns the copy, or NULL if it could not be allocated. An empty DATA_FIN without subflow FIN is returned but not queued. */
  14630. + __be32 *ptr;
  14631. + __u16 data_len;
  14632. + struct mp_dss *mdss;
  14633. + struct tcp_sock *tp = tcp_sk(sk);
  14634. + struct sock *meta_sk = mptcp_meta_sk(sk);
  14635. + struct mptcp_cb *mpcb = tp->mpcb;
  14636. + struct tcp_skb_cb *tcb;
  14637. + struct sk_buff *subskb = NULL;
  14638. +
  14639. + if (!reinject)
  14640. + TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ?
  14641. + MPTCPHDR_SEQ64_INDEX : 0);
  14642. +
  14643. + subskb = mptcp_pskb_copy(skb);
  14644. + if (!subskb)
  14645. + return NULL;
  14646. +
  14647. + TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index); /* remember on the meta skb that it has been sent on this subflow */
  14648. +
  14649. + if (!(sk->sk_route_caps & NETIF_F_ALL_CSUM) &&
  14650. + skb->ip_summed == CHECKSUM_PARTIAL) {
  14651. + subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0);
  14652. + subskb->ip_summed = skb->ip_summed = CHECKSUM_NONE;
  14653. + }
  14654. +
  14655. + /* The subskb is going in the subflow send-queue. Its path-mask
  14656. + * is not needed anymore and MUST be set to 0, as the path-mask
  14657. + * is a union with inet_skb_param.
  14658. + */
  14659. + tcb = TCP_SKB_CB(subskb);
  14660. + tcb->path_mask = 0;
  14661. +
  14662. + if (mptcp_is_data_fin(subskb))
  14663. + mptcp_combine_dfin(subskb, meta_sk, sk);
  14664. +
  14665. + if (tp->mpcb->infinite_mapping_snd)
  14666. + goto no_data_seq;
  14667. +
  14668. + if (tp->mpcb->send_infinite_mapping &&
  14669. + !before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) {
  14670. + tp->mptcp->fully_established = 1;
  14671. + tp->mpcb->infinite_mapping_snd = 1;
  14672. + tp->mptcp->infinite_cutoff_seq = tp->write_seq;
  14673. + tcb->mptcp_flags |= MPTCPHDR_INF;
  14674. + data_len = 0; /* a data_len of 0 is what gets written for the infinite mapping */
  14675. + } else {
  14676. + data_len = tcb->end_seq - tcb->seq; /* tcb->seq/end_seq still hold the meta-level values copied from skb at this point */
  14677. + }
  14678. +
  14679. + /**** Write MPTCP DSS-option to the packet. ****/
  14680. + ptr = (__be32 *)(subskb->data - (MPTCP_SUB_LEN_DSS_ALIGN +
  14681. + MPTCP_SUB_LEN_ACK_ALIGN +
  14682. + MPTCP_SUB_LEN_SEQ_ALIGN));
  14683. +
  14684. + /* Then we start writing it from the start */
  14685. + mdss = (struct mp_dss *)ptr;
  14686. +
  14687. + mdss->kind = TCPOPT_MPTCP;
  14688. + mdss->sub = MPTCP_SUB_DSS;
  14689. + mdss->rsv1 = 0;
  14690. + mdss->rsv2 = 0;
  14691. + mdss->F = (mptcp_is_data_fin(subskb) ? 1 : 0);
  14692. + mdss->m = 0;
  14693. + mdss->M = 1;
  14694. + mdss->a = 0;
  14695. + mdss->A = 1;
  14696. + mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum);
  14697. +
  14698. + ptr++;
  14699. + ptr++; /* data_ack will be set in mptcp_options_write */
  14700. + *ptr++ = htonl(tcb->seq); /* data_seq */
  14701. +
  14702. + /* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */
  14703. + if (mptcp_is_data_fin(subskb) && subskb->len == 0)
  14704. + *ptr++ = 0; /* subseq */
  14705. + else
  14706. + *ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */
  14707. +
  14708. + if (tp->mpcb->dss_csum && data_len) {
  14709. + __be16 *p16 = (__be16 *)ptr;
  14710. + __be32 hdseq = mptcp_get_highorder_sndbits(subskb, tp->mpcb);
  14711. + __wsum csum;
  14712. + *ptr = htonl(((data_len) << 16) |
  14713. + (TCPOPT_EOL << 8) |
  14714. + (TCPOPT_EOL));
  14715. +
  14716. + csum = csum_partial(ptr - 2, 12, subskb->csum); /* 12 bytes: the data_seq, subseq and data_len words written above */
  14717. + p16++; /* second 16-bit half of the data_len word, i.e. the two TCPOPT_EOL placeholder bytes */
  14718. + *p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum));
  14719. + } else {
  14720. + *ptr++ = htonl(((data_len) << 16) |
  14721. + (TCPOPT_NOP << 8) |
  14722. + (TCPOPT_NOP));
  14723. + }
  14724. +
  14725. +no_data_seq:
  14726. + tcb->seq = tp->write_seq; /* from here on tcb->seq/end_seq are subflow-level sequence numbers */
  14727. + tcb->sacked = 0; /* reset the sacked field: from the point of view
  14728. + * of this subflow, we are sending a brand new
  14729. + * segment */
  14730. + /* Take into account seg len */
  14731. + tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 1 : 0);
  14732. + tcb->end_seq = tp->write_seq;
  14733. +
  14734. + /* If it's a non-payload DATA_FIN (also no subflow-fin), the
  14735. + * segment is not part of the subflow but on a meta-only-level
  14736. + */
  14737. + if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) {
  14738. + tcp_add_write_queue_tail(sk, subskb);
  14739. + sk->sk_wmem_queued += subskb->truesize;
  14740. + sk_mem_charge(sk, subskb->truesize);
  14741. + }
  14742. +
  14743. + return subskb;
  14744. +}
  14745. +
  14746. +static void mptcp_sub_event_new_data_sent(struct sock *sk,
  14747. + struct sk_buff *subskb,
  14748. + struct sk_buff *skb)
  14749. +{ /* Subflow-level bookkeeping after subskb (the subflow copy of meta-level skb) has been transmitted on sk. */
  14750. + /* If it's a non-payload DATA_FIN (also no subflow-fin), the
  14751. + * segment is not part of the subflow but on a meta-only-level
  14752. + *
  14753. + * We free it, because it has been queued nowhere. (This describes the else-branch below.)
  14754. + */
  14755. + if (!mptcp_is_data_fin(subskb) ||
  14756. + (TCP_SKB_CB(subskb)->end_seq != TCP_SKB_CB(subskb)->seq)) {
  14757. + tcp_event_new_data_sent(sk, subskb);
  14758. + tcp_sk(sk)->mptcp->second_packet = 1; /* second_packet and last_end_data_seq feed the in-order check in mptcp_is_available() */
  14759. + tcp_sk(sk)->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq;
  14760. + } else {
  14761. + kfree_skb(subskb);
  14762. + }
  14763. +}
  14764. +
  14765. +/* Handle the packets and sockets after a tcp_transmit_skb failed: clear this subflow's bit in the meta skb's path_mask and, unless subskb carries a subflow FIN, take subskb back out of the subflow (or free it if it was never queued). */
  14766. +static void mptcp_transmit_skb_failed(struct sock *sk, struct sk_buff *skb,
  14767. + struct sk_buff *subskb)
  14768. +{
  14769. + struct tcp_sock *tp = tcp_sk(sk);
  14770. + struct mptcp_cb *mpcb = tp->mpcb;
  14771. +
  14772. + /* No work to do if we are in infinite mapping mode
  14773. + * There is only one subflow left and we cannot send this segment on
  14774. + * another subflow.
  14775. + */
  14776. + if (mpcb->infinite_mapping_snd)
  14777. + return;
  14778. +
  14779. + TCP_SKB_CB(skb)->path_mask &= ~mptcp_pi_to_flag(tp->mptcp->path_index); /* undo the bit set in mptcp_skb_entail() */
  14780. +
  14781. + if (TCP_SKB_CB(subskb)->tcp_flags & TCPHDR_FIN) {
  14782. + /* If it is a subflow-fin we must leave it on the
  14783. + * subflow-send-queue, so that the probe-timer
  14784. + * can retransmit it.
  14785. + */
  14786. + if (!tp->packets_out && !inet_csk(sk)->icsk_pending)
  14787. + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
  14788. + inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
  14789. + } else if (mptcp_is_data_fin(subskb) &&
  14790. + TCP_SKB_CB(subskb)->end_seq == TCP_SKB_CB(subskb)->seq) {
  14791. + /* An empty data-fin has not been enqueued on the subflow
  14792. + * and thus we free it.
  14793. + */
  14794. +
  14795. + kfree_skb(subskb);
  14796. + } else {
  14797. + /* In all other cases we remove it from the sub-queue.
  14798. + * Other subflows may send it, or the probe-timer will
  14799. + * handle it.
  14800. + */
  14801. + tcp_advance_send_head(sk, subskb);
  14802. +
  14803. + /* tcp_add_write_queue_tail initialized highest_sack. We have
  14804. + * to reset it, if necessary.
  14805. + */
  14806. + if (tp->highest_sack == subskb)
  14807. + tp->highest_sack = NULL;
  14808. +
  14809. + tcp_unlink_write_queue(subskb, sk);
  14810. + tp->write_seq -= subskb->len; /* undo the write_seq advance from mptcp_skb_entail(); no FIN in this branch, so only len was added */
  14811. + sk_wmem_free_skb(sk, subskb);
  14812. + }
  14813. +}
  14814. +
  14815. +/* Function to create two new TCP segments. Shrinks the given segment
  14816. + * to the specified size and appends a new segment with the rest of the
  14817. + * packet to the list. This won't be called frequently, I hope.
  14818. + * Remember, these are still headerless SKBs at this point. With reinject == 1 the new segment is linked into the mpcb reinject_queue instead of sk's write queue. Returns 0 on success, -EINVAL if len > skb->len, -ENOMEM on allocation failure.
  14819. + */
  14820. +int mptcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
  14821. + unsigned int mss_now, int reinject)
  14822. +{
  14823. + struct tcp_sock *tp = tcp_sk(sk);
  14824. + struct sk_buff *buff;
  14825. + int nsize, old_factor;
  14826. + int nlen;
  14827. + u8 flags;
  14828. + int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN +
  14829. + MPTCP_SUB_LEN_SEQ_ALIGN;
  14830. + char dss[MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN +
  14831. + MPTCP_SUB_LEN_SEQ_ALIGN]; /* only filled and used when sk is not the meta socket */
  14832. +
  14833. + if (WARN_ON(len > skb->len))
  14834. + return -EINVAL;
  14835. +
  14836. + /* DSS-option must be recovered afterwards. */
  14837. + if (!is_meta_sk(sk))
  14838. + memcpy(dss, skb->data - dsslen, dsslen);
  14839. +
  14840. + nsize = skb_headlen(skb) - len; /* number of linear bytes that move to the new segment */
  14841. + if (nsize < 0)
  14842. + nsize = 0;
  14843. +
  14844. + if (skb_cloned(skb)) {
  14845. + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
  14846. + return -ENOMEM;
  14847. + /* Recover dss-option */
  14848. + if (!is_meta_sk(sk))
  14849. + memcpy(skb->data - dsslen, dss, dsslen);
  14850. + }
  14851. +
  14852. + /* Get a new skb... force flag on. */
  14853. + buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
  14854. + if (buff == NULL)
  14855. + return -ENOMEM; /* We'll just try again later. */
  14856. +
  14857. + /* See below - if reinject == 1, the buff will be added to the reinject-
  14858. + * queue, which is currently not part of the memory-accounting.
  14859. + */
  14860. + if (reinject != 1) {
  14861. + sk->sk_wmem_queued += buff->truesize;
  14862. + sk_mem_charge(sk, buff->truesize);
  14863. + }
  14864. + nlen = skb->len - len - nsize; /* non-linear (paged) bytes that move to buff */
  14865. + buff->truesize += nlen;
  14866. + skb->truesize -= nlen;
  14867. +
  14868. + /* Correct the sequence numbers. */
  14869. + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
  14870. + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
  14871. + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
  14872. +
  14873. + /* PSH and FIN should only be set in the second packet. */
  14874. + flags = TCP_SKB_CB(skb)->tcp_flags;
  14875. + TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
  14876. + TCP_SKB_CB(buff)->tcp_flags = flags;
  14877. + TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
  14878. +
  14879. + flags = TCP_SKB_CB(skb)->mptcp_flags; /* likewise, the DATA_FIN flag stays only on the second segment */
  14880. + TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN);
  14881. + TCP_SKB_CB(buff)->mptcp_flags = flags;
  14882. +
  14883. + if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
  14884. + /* Copy and checksum data tail into the new buffer. */
  14885. + buff->csum = csum_partial_copy_nocheck(skb->data + len,
  14886. + skb_put(buff, nsize),
  14887. + nsize, 0);
  14888. +
  14889. + skb_trim(skb, len);
  14890. +
  14891. + skb->csum = csum_block_sub(skb->csum, buff->csum, len);
  14892. + } else {
  14893. + skb->ip_summed = CHECKSUM_PARTIAL;
  14894. + skb_split(skb, buff, len);
  14895. + }
  14896. +
  14897. + /* We lost the dss-option when creating buff - put it back! */
  14898. + if (!is_meta_sk(sk))
  14899. + memcpy(buff->data - dsslen, dss, dsslen);
  14900. +
  14901. + buff->ip_summed = skb->ip_summed;
  14902. +
  14903. + /* Looks stupid, but our code really uses when of
  14904. + * skbs, which it never sent before. --ANK
  14905. + */
  14906. + TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
  14907. + buff->tstamp = skb->tstamp;
  14908. +
  14909. + old_factor = tcp_skb_pcount(skb);
  14910. +
  14911. + /* Fix up tso_factor for both original and new SKB. */
  14912. + tcp_set_skb_tso_segs(sk, skb, mss_now);
  14913. + tcp_set_skb_tso_segs(sk, buff, mss_now);
  14914. +
  14915. + /* If this packet has been sent out already, we must
  14916. + * adjust the various packet counters.
  14917. + */
  14918. + if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq) && reinject != 1) {
  14919. + int diff = old_factor - tcp_skb_pcount(skb) -
  14920. + tcp_skb_pcount(buff);
  14921. +
  14922. + if (diff)
  14923. + tcp_adjust_pcount(sk, skb, diff);
  14924. + }
  14925. +
  14926. + /* Link BUFF into the send queue. */
  14927. + skb_header_release(buff);
  14928. + if (reinject == 1)
  14929. + __skb_queue_after(&tcp_sk(sk)->mpcb->reinject_queue, skb, buff);
  14930. + else
  14931. + tcp_insert_write_queue_after(skb, buff, sk);
  14932. +
  14933. + return 0;
  14934. +}
  14935. +
  14936. +int mptso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  14937. + unsigned int mss_now, gfp_t gfp, int reinject)
  14938. +{ /* Split skb at len into skb and a new segment holding the rest, for skbs made up entirely of paged data; any other skb is handed to mptcp_fragment(). With reinject == 1 the new segment is linked into the mpcb reinject_queue instead of sk's write queue. Returns 0 on success or -ENOMEM. */
  14939. + struct sk_buff *buff;
  14940. + int nlen = skb->len - len, old_factor;
  14941. + u8 flags;
  14942. + int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN +
  14943. + MPTCP_SUB_LEN_SEQ_ALIGN;
  14944. +
  14945. + /* All of a TSO frame must be composed of paged data. */
  14946. + if (skb->len != skb->data_len)
  14947. + return mptcp_fragment(sk, skb, len, mss_now, reinject);
  14948. +
  14949. + buff = sk_stream_alloc_skb(sk, 0, gfp);
  14950. + if (unlikely(buff == NULL))
  14951. + return -ENOMEM;
  14952. +
  14953. + /* See below - if reinject == 1, the buff will be added to the reinject-
  14954. + * queue, which is currently not part of the memory-accounting.
  14955. + */
  14956. + if (reinject != 1) {
  14957. + sk->sk_wmem_queued += buff->truesize;
  14958. + sk_mem_charge(sk, buff->truesize);
  14959. + }
  14960. + buff->truesize += nlen;
  14961. + skb->truesize -= nlen;
  14962. +
  14963. + /* Correct the sequence numbers. */
  14964. + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
  14965. + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
  14966. + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
  14967. +
  14968. + /* PSH and FIN should only be set in the second packet. */
  14969. + flags = TCP_SKB_CB(skb)->tcp_flags;
  14970. + TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
  14971. + TCP_SKB_CB(buff)->tcp_flags = flags;
  14972. +
  14973. + flags = TCP_SKB_CB(skb)->mptcp_flags; /* likewise, the DATA_FIN flag stays only on the second segment */
  14974. + TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN);
  14975. + TCP_SKB_CB(buff)->mptcp_flags = flags;
  14976. +
  14977. + /* This packet was never sent out yet, so no SACK bits. */
  14978. + TCP_SKB_CB(buff)->sacked = 0;
  14979. +
  14980. + buff->ip_summed = CHECKSUM_PARTIAL;
  14981. + skb->ip_summed = CHECKSUM_PARTIAL;
  14982. + skb_split(skb, buff, len);
  14983. +
  14984. + /* We lost the dss-option when creating buff - put it back! */
  14985. + if (!is_meta_sk(sk))
  14986. + memcpy(buff->data - dsslen, skb->data - dsslen, dsslen); /* copied straight from skb's headroom; unlike mptcp_fragment() no temporary buffer is used */
  14987. +
  14988. + old_factor = tcp_skb_pcount(skb);
  14989. +
  14990. + /* Fix up tso_factor for both original and new SKB. */
  14991. + tcp_set_skb_tso_segs(sk, skb, mss_now);
  14992. + tcp_set_skb_tso_segs(sk, buff, mss_now);
  14993. +
  14994. + /* If this packet has been sent out already, we must
  14995. + * adjust the various packet counters.
  14996. + */
  14997. + if (!before(tcp_sk(sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq) && reinject != 1) {
  14998. + int diff = old_factor - tcp_skb_pcount(skb) -
  14999. + tcp_skb_pcount(buff);
  15000. +
  15001. + if (diff)
  15002. + tcp_adjust_pcount(sk, skb, diff);
  15003. + }
  15004. +
  15005. + /* Link BUFF into the send queue. */
  15006. + skb_header_release(buff);
  15007. + if (reinject == 1)
  15008. + __skb_queue_after(&tcp_sk(sk)->mpcb->reinject_queue, skb, buff);
  15009. + else
  15010. + tcp_insert_write_queue_after(skb, buff, sk);
  15011. +
  15012. + return 0;
  15013. +}
  15014. +
  15015. +/* Inspired by tcp_write_wakeup. If the meta send-head starts inside the meta send window, (part of) it is sent on an available subflow: returns 0 on success, -1 if no subflow is available or fragmenting/copying fails, or the tcp_transmit_skb() error. Otherwise probes are sent on the subflows and the result of the probe loop is returned. */
  15016. +int mptcp_write_wakeup(struct sock *meta_sk)
  15017. +{
  15018. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  15019. + struct sk_buff *skb, *subskb;
  15020. +
  15021. + skb = tcp_send_head(meta_sk);
  15022. + if (skb &&
  15023. + before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) {
  15024. + int err;
  15025. + unsigned int mss;
  15026. + unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq; /* bytes of skb that still fit into the meta-level send window */
  15027. + struct sock *subsk = get_available_subflow(meta_sk, skb, &mss);
  15028. + if (!subsk)
  15029. + return -1;
  15030. +
  15031. + if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
  15032. + meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
  15033. +
  15034. + /* We are probing the opening of a window
  15035. + * but the window size is != 0
  15036. + * must have been a result SWS avoidance ( sender )
  15037. + */
  15038. + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
  15039. + skb->len > mss) {
  15040. + seg_size = min(seg_size, mss);
  15041. + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
  15042. + if (mptcp_fragment(meta_sk, skb, seg_size, mss, 0))
  15043. + return -1;
  15044. + } else if (!tcp_skb_pcount(skb)) {
  15045. + tcp_set_skb_tso_segs(meta_sk, skb, mss);
  15046. + }
  15047. +
  15048. + subskb = mptcp_skb_entail(subsk, skb, 0);
  15049. + if (!subskb)
  15050. + return -1;
  15051. +
  15052. + TCP_SKB_CB(subskb)->tcp_flags |= TCPHDR_PSH;
  15053. + TCP_SKB_CB(skb)->when = tcp_time_stamp;
  15054. + TCP_SKB_CB(subskb)->when = tcp_time_stamp;
  15055. + err = tcp_transmit_skb(subsk, subskb, 1, GFP_ATOMIC);
  15056. + if (unlikely(err)) {
  15057. + mptcp_transmit_skb_failed(subsk, skb, subskb);
  15058. + return err;
  15059. + }
  15060. +
  15061. + mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq -
  15062. + TCP_SKB_CB(skb)->seq);
  15063. + tcp_event_new_data_sent(meta_sk, skb);
  15064. + mptcp_sub_event_new_data_sent(subsk, subskb, skb);
  15065. +
  15066. + return 0;
  15067. + } else {
  15068. + struct sock *sk_it;
  15069. + int ans = 0;
  15070. +
  15071. + if (between(meta_tp->snd_up, meta_tp->snd_una + 1,
  15072. + meta_tp->snd_una + 0xFFFF)) {
  15073. + mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
  15074. + if (mptcp_sk_can_send_ack(sk_it))
  15075. + tcp_xmit_probe_skb(sk_it, 1); /* return value deliberately not checked here, unlike in the loop below */
  15076. + }
  15077. + }
  15078. +
  15079. + /* At least one of the tcp_xmit_probe_skb's has to succeed */
  15080. + mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
  15081. + int ret;
  15082. +
  15083. + if (!mptcp_sk_can_send_ack(sk_it))
  15084. + continue;
  15085. +
  15086. + ret = tcp_xmit_probe_skb(sk_it, 0);
  15087. + if (unlikely(ret > 0))
  15088. + ans = ret; /* NOTE(review): ans ends up as the last positive ret, i.e. non-zero if any probe returned > 0 - this does not obviously match the comment above; confirm */
  15089. + }
  15090. + return ans;
  15091. + }
  15092. +}
  15093. +/* Copy into @skb the path_mask of the meta write-queue entry (located before the send head) that starts at the same seq, if any. */
  15094. +static void mptcp_find_and_set_pathmask(struct sock *meta_sk, struct sk_buff *skb)
  15095. +{
  15096. + struct sk_buff *skb_it;
  15097. +
  15098. + skb_it = tcp_write_queue_head(meta_sk);
  15099. + /* Walk the meta write queue from its head; stop at the send head or at the first seq match. */
  15100. + tcp_for_write_queue_from(skb_it, meta_sk) {
  15101. + if (skb_it == tcp_send_head(meta_sk))
  15102. + break;
  15103. +
  15104. + if (TCP_SKB_CB(skb_it)->seq == TCP_SKB_CB(skb)->seq) {
  15105. + TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask;
  15106. + break;
  15107. + }
  15108. + }
  15109. +}
  15110. +/* Decide whether the head of the meta write queue should be retransmitted on subflow @sk (optionally penalizing a slower subflow first); returns that skb or NULL. */
  15111. +static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal)
  15112. +{
  15113. + struct sock *meta_sk;
  15114. + struct tcp_sock *tp = tcp_sk(sk), *tp_it;
  15115. + struct sk_buff *skb_head;
  15116. + /* Nothing to do when the connection has only one subflow. */
  15117. + if (tp->mpcb->cnt_subflows == 1)
  15118. + return NULL;
  15119. +
  15120. + meta_sk = mptcp_meta_sk(sk);
  15121. + skb_head = tcp_write_queue_head(meta_sk);
  15122. + /* Nothing to do if the meta write queue is empty or its head is the send head. */
  15123. + if (!skb_head || skb_head == tcp_send_head(meta_sk))
  15124. + return NULL;
  15125. +
  15126. + /* If penalization is optional (coming from mptcp_next_segment()) and
  15127. + * we are not send-buffer-limited we do not penalize. The retransmission
  15128. + * is just an optimization to fix the idle-time due to the delay before
  15129. + * we wake up the application.
  15130. + */
  15131. + if (!penal && sk_stream_memory_free(meta_sk))
  15132. + goto retrans;
  15133. +
  15134. + /* Only penalize again after an RTT has elapsed */
  15135. + if (tcp_time_stamp - tp->mptcp->last_rbuf_opti < tp->srtt >> 3)
  15136. + goto retrans;
  15137. +
  15138. + /* Halve the cwnd (and ssthresh, unless infinite) of the first other
  15138. subflow found in skb_head's path_mask, if it is slower and in CA_Open */
  15139. + mptcp_for_each_tp(tp->mpcb, tp_it) {
  15140. + if (tp_it != tp &&
  15141. + TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
  15142. + if (tp->srtt < tp_it->srtt && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) {
  15143. + tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U);
  15144. + if (tp_it->snd_ssthresh != TCP_INFINITE_SSTHRESH)
  15145. + tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U);
  15146. + /* Remember when we last penalized (checked against srtt above). */
  15147. + tp->mptcp->last_rbuf_opti = tcp_time_stamp;
  15148. + }
  15149. + break;
  15150. + }
  15151. + }
  15152. +
  15153. +retrans:
  15154. +
  15155. + /* Segment not yet injected into this path? Take it!!! */
  15156. + if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) {
  15157. + bool do_retrans = false;
  15158. + mptcp_for_each_tp(tp->mpcb, tp_it) {
  15159. + if (tp_it != tp &&
  15160. + TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
  15161. + if (tp_it->snd_cwnd <= 4) {
  15162. + do_retrans = true;
  15163. + break;
  15164. + }
  15165. + /* Otherwise retransmit only while 4 * our srtt is below the other subflow's srtt. */
  15166. + if (4 * tp->srtt >= tp_it->srtt) {
  15167. + do_retrans = false;
  15168. + break;
  15169. + } else {
  15170. + do_retrans = true;
  15171. + }
  15172. + }
  15173. + }
  15174. +
  15175. + if (do_retrans)
  15176. + return skb_head;
  15177. + }
  15178. + return NULL;
  15179. +}
  15180. +/* Transmit the segments handed out by mptcp_next_segment() on the subflows picked by get_available_subflow(). Returns 0 if at least one packet was sent. */
  15181. +int mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle,
  15182. + int push_one, gfp_t gfp)
  15183. +{
  15184. + struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp;
  15185. + struct sock *subsk;
  15186. + struct mptcp_cb *mpcb = meta_tp->mpcb;
  15187. + struct sk_buff *skb;
  15188. + unsigned int tso_segs, old_factor, sent_pkts;
  15189. + int cwnd_quota;
  15190. + int result;
  15191. + int reinject = 0;
  15192. +
  15193. + sent_pkts = 0;
  15194. +
  15195. + /* Currently mtu-probing is not done in MPTCP ("&& 0" keeps this branch dead) */
  15196. + if (!push_one && 0) {
  15197. + /* Do MTU probing. */
  15198. + result = tcp_mtu_probe(meta_sk);
  15199. + if (!result)
  15200. + return 0;
  15201. + else if (result > 0)
  15202. + sent_pkts = 1;
  15203. + }
  15204. +
  15205. + while ((skb = mptcp_next_segment(meta_sk, &reinject))) {
  15206. + unsigned int limit;
  15207. + struct sk_buff *subskb = NULL;
  15208. + u32 noneligible = mpcb->noneligible;
  15209. + /* The saved mask is restored further down, once a sub-skb has been entailed. */
  15210. + if (reinject == 1) {
  15211. + if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
  15212. + /* Segment already reached the peer, take the next one */
  15213. + __skb_unlink(skb, &mpcb->reinject_queue);
  15214. + __kfree_skb(skb);
  15215. + continue;
  15216. + }
  15217. +
  15218. + /* Reinjection and it is coming from a subflow? We need
  15219. + * to find out the path-mask from the meta-write-queue
  15220. + * to properly select a subflow.
  15221. + */
  15222. + if (!TCP_SKB_CB(skb)->path_mask)
  15223. + mptcp_find_and_set_pathmask(meta_sk, skb);
  15224. + }
  15225. +
  15226. +subflow:
  15227. + subsk = get_available_subflow(meta_sk, skb, &mss_now);
  15228. + if (!subsk)
  15229. + break;
  15230. + subtp = tcp_sk(subsk);
  15231. +
  15232. + /* Since all subsocks are locked before calling the scheduler,
  15233. + * the tcp_send_head should not change.
  15234. + */
  15235. + BUG_ON(!reinject && tcp_send_head(meta_sk) != skb);
  15236. +retry:
  15237. + /* If the segment was cloned (e.g. a meta retransmission),
  15238. + * the header must be expanded/copied so that there is no
  15239. + * corruption of TSO information.
  15240. + */
  15241. + if (skb_unclone(skb, GFP_ATOMIC))
  15242. + break;
  15243. +
  15244. + old_factor = tcp_skb_pcount(skb);
  15245. + tcp_set_skb_tso_segs(meta_sk, skb, mss_now);
  15246. + tso_segs = tcp_skb_pcount(skb);
  15247. +
  15248. + if (reinject == -1) {
  15249. + /* The packet has already once been sent, so if we
  15250. + * change the pcount here we have to adjust packets_out
  15251. + * in the meta-sk
  15252. + */
  15253. + int diff = old_factor - tso_segs;
  15254. +
  15255. + if (diff)
  15256. + tcp_adjust_pcount(meta_sk, skb, diff);
  15257. + }
  15258. +
  15259. + cwnd_quota = tcp_cwnd_test(subtp, skb);
  15260. + if (!cwnd_quota) {
  15261. + /* May happen due to two cases:
  15262. + *
  15263. + * - if at the first selection we circumvented
  15264. + * the test due to a DATA_FIN (and got rejected at
  15265. + * tcp_snd_wnd_test), but the reinjected segment is not
  15266. + * a DATA_FIN.
  15267. + * - if we take a DATA_FIN with data, but
  15268. + * tcp_set_skb_tso_segs() increases the number of
  15269. + * tso_segs to something > 1. Then, cwnd_test might
  15270. + * reject it.
  15271. + */
  15272. + mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index);
  15273. + continue;
  15274. + }
  15275. + /* A segment from the regular send head that fails the meta-level send-window test: try a meta-level retransmission (reinject = -1) instead. */
  15276. + if (!reinject && unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now))) {
  15277. + skb = mptcp_rcv_buf_optimization(subsk, 1);
  15278. + if (skb) {
  15279. + reinject = -1;
  15280. + goto retry;
  15281. + }
  15282. + break;
  15283. + }
  15284. +
  15285. + if (tso_segs == 1) {
  15286. + if (unlikely(!tcp_nagle_test(meta_tp, skb, mss_now,
  15287. + (tcp_skb_is_last(meta_sk, skb) ?
  15288. + nonagle : TCP_NAGLE_PUSH))))
  15289. + break;
  15290. + } else {
  15291. + /* Do not try to defer the transmission of a reinjected
  15292. + * segment. Send it directly.
  15293. + * If it is not possible to send the TSO segment on the
  15294. + * best subflow right now try to look for another subflow.
  15295. + * If there is no subflow available defer the segment to avoid
  15296. + * the call to mptso_fragment.
  15297. + */
  15298. + if (!push_one && !reinject && tcp_tso_should_defer(subsk, skb)) {
  15299. + mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index);
  15300. + goto subflow;
  15301. + }
  15302. + }
  15303. +
  15304. + limit = mss_now;
  15305. + if (tso_segs > 1 && !tcp_urg_mode(meta_tp))
  15306. + limit = tcp_mss_split_point(subsk, skb, mss_now,
  15307. + min_t(unsigned int,
  15308. + cwnd_quota,
  15309. + subsk->sk_gso_max_segs),
  15310. + nonagle);
  15311. +
  15312. + if (skb->len > limit &&
  15313. + unlikely(mptso_fragment(meta_sk, skb, limit, mss_now, gfp, reinject)))
  15314. + break;
  15315. +
  15316. + subskb = mptcp_skb_entail(subsk, skb, reinject);
  15317. + if (!subskb)
  15318. + break;
  15319. +
  15320. + mpcb->noneligible = noneligible;
  15321. + TCP_SKB_CB(skb)->when = tcp_time_stamp;
  15322. + TCP_SKB_CB(subskb)->when = tcp_time_stamp;
  15323. + if (unlikely(tcp_transmit_skb(subsk, subskb, 1, gfp))) {
  15324. + mptcp_transmit_skb_failed(subsk, skb, subskb);
  15325. + mpcb->noneligible |= mptcp_pi_to_flag(subtp->mptcp->path_index);
  15326. + continue;
  15327. + }
  15328. + /* Only for segments taken from the regular send head (reinject == 0). */
  15329. + if (!reinject) {
  15330. + mptcp_check_sndseq_wrap(meta_tp,
  15331. + TCP_SKB_CB(skb)->end_seq -
  15332. + TCP_SKB_CB(skb)->seq);
  15333. + tcp_event_new_data_sent(meta_sk, skb);
  15334. + }
  15335. +
  15336. + tcp_minshall_update(meta_tp, mss_now, skb);
  15337. + sent_pkts += tcp_skb_pcount(skb);
  15338. + tcp_sk(subsk)->mptcp->sent_pkts += tcp_skb_pcount(skb);
  15339. +
  15340. + mptcp_sub_event_new_data_sent(subsk, subskb, skb);
  15341. + /* Segments from the reinject queue (reinject == 1) are unlinked and freed once transmitted. */
  15342. + if (reinject > 0) {
  15343. + __skb_unlink(skb, &mpcb->reinject_queue);
  15344. + kfree_skb(skb);
  15345. + }
  15346. +
  15347. + if (push_one)
  15348. + break;
  15349. + }
  15350. + /* Reset the non-eligible subflow mask before leaving. */
  15351. + mpcb->noneligible = 0;
  15352. + /* Per subflow: add its sent_pkts to prr_out while in cwnd reduction, validate the cwnd, then clear the counter. */
  15353. + if (likely(sent_pkts)) {
  15354. + mptcp_for_each_sk(mpcb, subsk) {
  15355. + subtp = tcp_sk(subsk);
  15356. + if (subtp->mptcp->sent_pkts) {
  15357. + if (tcp_in_cwnd_reduction(subsk))
  15358. + subtp->prr_out += subtp->mptcp->sent_pkts;
  15359. + tcp_cwnd_validate(subsk);
  15360. + subtp->mptcp->sent_pkts = 0;
  15361. + }
  15362. + }
  15363. + return 0;
  15364. + }
  15365. + /* Nothing was sent: nonzero only if the meta-sk has packets_out == 0 while a send head exists. */
  15366. + return !meta_tp->packets_out && tcp_send_head(meta_sk);
  15367. +}
  15368. +/* Push pending frames on the meta-socket that @sk maps to via mptcp_meta_sk(). */
  15369. +void mptcp_write_space(struct sock *sk)
  15370. +{
  15371. + mptcp_push_pending_frames(mptcp_meta_sk(sk));
  15372. +}
  15373. +/* Compute the receive window to offer: space figures and rcv_mss come from @sk, while window_clamp, rcv_ssthresh, advmss and rcv_wnd are taken from the meta tcp_sock. */
  15374. +u32 __mptcp_select_window(struct sock *sk)
  15375. +{
  15376. + struct inet_connection_sock *icsk = inet_csk(sk);
  15377. + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
  15378. + int mss, free_space, full_space, window;
  15379. +
  15380. + /* MSS for the peer's data. Previous versions used mss_clamp
  15381. + * here. I don't know if the value based on our guesses
  15382. + * of peer's MSS is better for the performance. It's more correct
  15383. + * but may be worse for the performance because of rcv_mss
  15384. + * fluctuations. --SAW 1998/11/1
  15385. + */
  15386. + mss = icsk->icsk_ack.rcv_mss;
  15387. + free_space = tcp_space(sk);
  15388. + full_space = min_t(int, meta_tp->window_clamp,
  15389. + tcp_full_space(sk));
  15390. +
  15391. + if (mss > full_space)
  15392. + mss = full_space;
  15393. + /* Free space has dropped below half of the full space. */
  15394. + if (free_space < (full_space >> 1)) {
  15395. + icsk->icsk_ack.quick = 0;
  15396. +
  15397. + if (tcp_memory_pressure)
  15398. + /* TODO this has to be adapted when we support different
  15399. + * MSS's among the subflows.
  15400. + */
  15401. + meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh,
  15402. + 4U * meta_tp->advmss);
  15403. + /* Offer a zero window if not even one mss fits into the free space. */
  15404. + if (free_space < mss)
  15405. + return 0;
  15406. + }
  15407. + /* Never offer more than the meta rcv_ssthresh. */
  15408. + if (free_space > meta_tp->rcv_ssthresh)
  15409. + free_space = meta_tp->rcv_ssthresh;
  15410. +
  15411. + /* Don't do rounding if we are using window scaling, since the
  15412. + * scaled window will not line up with the MSS boundary anyway.
  15413. + */
  15414. + window = meta_tp->rcv_wnd;
  15415. + if (tp->rx_opt.rcv_wscale) {
  15416. + window = free_space;
  15417. +
  15418. + /* Advertise enough space so that it won't get scaled away.
  15419. + * Important case: prevent zero window announcement if
  15420. + * 1<<rcv_wscale > mss.
  15421. + */
  15422. + if (((window >> tp->rx_opt.rcv_wscale) << tp->
  15423. + rx_opt.rcv_wscale) != window)
  15424. + window = (((window >> tp->rx_opt.rcv_wscale) + 1)
  15425. + << tp->rx_opt.rcv_wscale);
  15426. + } else {
  15427. + /* Get the largest window that is a nice multiple of mss.
  15428. + * Window clamp already applied above.
  15429. + * If our current window offering is within 1 mss of the
  15430. + * free space we just keep it. This prevents the divide
  15431. + * and multiply from happening most of the time.
  15432. + * We also don't do any window rounding when the free space
  15433. + * is too small.
  15434. + */
  15435. + if (window <= free_space - mss || window > free_space)
  15436. + window = (free_space / mss) * mss;
  15437. + else if (mss == full_space &&
  15438. + free_space > window + (full_space >> 1))
  15439. + window = free_space;
  15440. + }
  15441. +
  15442. + return window;
  15443. +}
  15444. +/* Fill @opts for an outgoing SYN (MP_CAPABLE on the master subflow, MP_JOIN otherwise) and subtract the aligned option length from *@remaining. */
  15445. +void mptcp_syn_options(struct sock *sk, struct tcp_out_options *opts,
  15446. + unsigned *remaining)
  15447. +{
  15448. + struct tcp_sock *tp = tcp_sk(sk);
  15449. +
  15450. + opts->options |= OPTION_MPTCP;
  15451. + if (is_master_tp(tp)) {
  15452. + opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN;
  15453. + *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
  15454. + opts->mp_capable.sender_key = tp->mptcp_loc_key;
  15455. + opts->dss_csum = !!sysctl_mptcp_checksum;
  15456. + } else {
  15457. + struct mptcp_cb *mpcb = tp->mpcb;
  15458. + /* MP_JOIN SYN carries the remote token, our address id and our nonce. */
  15459. + opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN;
  15460. + *remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN;
  15461. + opts->mp_join_syns.token = mpcb->mptcp_rem_token;
  15462. + opts->addr_id = tp->mptcp->loc_id;
  15463. + opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce;
  15464. + }
  15465. +}
  15466. +/* Fill @opts for an outgoing SYN/ACK (MP_CAPABLE if the request has no mpcb yet, MP_JOIN otherwise) and subtract the aligned option length from *@remaining. */
  15467. +void mptcp_synack_options(struct request_sock *req,
  15468. + struct tcp_out_options *opts, unsigned *remaining)
  15469. +{
  15470. + struct mptcp_request_sock *mtreq;
  15471. + mtreq = mptcp_rsk(req);
  15472. +
  15473. + opts->options |= OPTION_MPTCP;
  15474. + /* MPCB not yet set - thus it's a new MPTCP-session */
  15475. + if (!mtreq->mpcb) {
  15476. + opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK;
  15477. + opts->mp_capable.sender_key = mtreq->mptcp_loc_key;
  15478. + opts->dss_csum = !!sysctl_mptcp_checksum || mtreq->dss_csum;
  15479. + *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
  15480. + } else {
  15481. + opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK;
  15482. + opts->mp_join_syns.sender_truncated_mac =
  15483. + mtreq->mptcp_hash_tmac;
  15484. + opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce;
  15485. + opts->addr_id = mtreq->loc_id;
  15486. + *remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN;
  15487. + }
  15488. +}
  15489. +/* Select the MPTCP options for a segment on subflow @sk, record them in @opts and add their aligned lengths to *@size. @skb may be NULL. */
  15490. +void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
  15491. + struct tcp_out_options *opts, unsigned *size)
  15492. +{
  15493. + struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
  15494. + struct mptcp_cb *mpcb = tp->mpcb;
  15495. + struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
  15496. +
  15497. + /* In fallback mp_fail-mode, we have to repeat it until the fallback
  15498. + * has been done by the sender. The 64-bit csum_cutoff_seq is split
  15499. + * over data_ack (high half) and data_seq (low half). */
  15500. + if (unlikely(tp->mptcp->send_mp_fail)) {
  15501. + opts->options |= OPTION_MPTCP;
  15502. + opts->mptcp_options |= OPTION_MP_FAIL;
  15503. + opts->data_ack = (__u32)(mpcb->csum_cutoff_seq >> 32);
  15504. + opts->data_seq = (__u32)mpcb->csum_cutoff_seq;
  15505. + *size += MPTCP_SUB_LEN_FAIL;
  15506. + return;
  15507. + }
  15508. + /* MP_FCLOSE is sent alone and reuses mp_capable.receiver_key to carry the remote key. */
  15509. + if (unlikely(tp->send_mp_fclose)) {
  15510. + opts->options |= OPTION_MPTCP;
  15511. + opts->mptcp_options |= OPTION_MP_FCLOSE;
  15512. + opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
  15513. + *size += MPTCP_SUB_LEN_FCLOSE_ALIGN;
  15514. + return;
  15515. + }
  15516. +
  15517. + /* 1. If we are the sender of the infinite-mapping, we need the
  15518. + * MPTCPHDR_INF-flag, because a retransmission of the
  15519. + * infinite-announcement still needs the mptcp-option.
  15520. + *
  15521. + * We need infinite_cutoff_seq, because retransmissions from before
  15522. + * the infinite-cutoff-moment still need the MPTCP-signalling to stay
  15523. + * consistent.
  15524. + *
  15525. + * 2. If we are the receiver of the infinite-mapping, we always skip
  15526. + * mptcp-options, because acknowledgments from before the
  15527. + * infinite-mapping point have already been sent out.
  15528. + *
  15529. + * I know, the whole infinite-mapping stuff is ugly...
  15530. + *
  15531. + * TODO: Handle wrapped data-sequence numbers
  15532. + * (even if it's very unlikely)
  15533. + */
  15534. + if (unlikely(mpcb->infinite_mapping_snd) &&
  15535. + tp->mptcp->fully_established &&
  15536. + ((mpcb->send_infinite_mapping && tcb &&
  15537. + !(tcb->mptcp_flags & MPTCPHDR_INF) &&
  15538. + !before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) ||
  15539. + !mpcb->send_infinite_mapping))
  15540. + return;
  15541. +
  15542. + if (unlikely(tp->mptcp->include_mpc)) {
  15543. + opts->options |= OPTION_MPTCP;
  15544. + opts->mptcp_options |= OPTION_MP_CAPABLE |
  15545. + OPTION_TYPE_ACK;
  15546. + *size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN;
  15547. + opts->mp_capable.sender_key = mpcb->mptcp_loc_key;
  15548. + opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
  15549. + opts->dss_csum = mpcb->dss_csum;
  15550. + /* The flag stays set when we are called without an skb. */
  15551. + if (skb)
  15552. + tp->mptcp->include_mpc = 0;
  15553. + }
  15554. + if (unlikely(tp->mptcp->pre_established)) {
  15555. + opts->options |= OPTION_MPTCP;
  15556. + opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK;
  15557. + *size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN;
  15558. + }
  15559. + /* Otherwise account for a DSS option: data-ack always, plus room for the data-seq part unless @skb is given and is not a data-seq skb. */
  15560. + if (!tp->mptcp->include_mpc && !tp->mptcp->pre_established) {
  15561. + opts->options |= OPTION_MPTCP;
  15562. + opts->mptcp_options |= OPTION_DATA_ACK;
  15563. + /* If !skb, we come from tcp_current_mss and thus we always
  15564. + * assume that the DSS-option will be set for the data-packet.
  15565. + */
  15566. + if (skb && !mptcp_is_data_seq(skb)) {
  15567. + opts->data_ack = meta_tp->rcv_nxt;
  15568. +
  15569. + *size += MPTCP_SUB_LEN_ACK_ALIGN;
  15570. + } else {
  15571. + opts->data_ack = meta_tp->rcv_nxt;
  15572. +
  15573. + /* Doesn't matter, if csum included or not. It will be
  15574. + * either 10 or 12, and thus aligned = 12
  15575. + */
  15576. + *size += MPTCP_SUB_LEN_ACK_ALIGN +
  15577. + MPTCP_SUB_LEN_SEQ_ALIGN;
  15578. + }
  15579. +
  15580. + *size += MPTCP_SUB_LEN_DSS_ALIGN;
  15581. + }
  15582. + /* Optional pm_ops hook; it receives @size and @opts, so it may add further options. */
  15583. + if (mpcb->pm_ops->addr_signal)
  15584. + mpcb->pm_ops->addr_signal(sk, size, opts, skb);
  15585. + /* MP_PRIO is only added if it still fits into MAX_TCP_OPTION_SPACE; the request is cleared only when an skb is given. */
  15586. + if (unlikely(tp->mptcp->send_mp_prio) &&
  15587. + MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) {
  15588. + opts->options |= OPTION_MPTCP;
  15589. + opts->mptcp_options |= OPTION_MP_PRIO;
  15590. + if (skb)
  15591. + tp->mptcp->send_mp_prio = 0;
  15592. + *size += MPTCP_SUB_LEN_PRIO_ALIGN;
  15593. + }
  15594. +
  15595. + return;
  15596. +}
  15597. +/* Return tcp_select_window(@sk); also copy the subflow's rcv_wnd to the meta tcp_sock and set the meta rcv_wup to the meta rcv_nxt. */
  15598. +u16 mptcp_select_window(struct sock *sk)
  15599. +{
  15600. + u16 new_win = tcp_select_window(sk);
  15601. + struct tcp_sock *tp = tcp_sk(sk);
  15602. + struct tcp_sock *meta_tp = mptcp_meta_tp(tp);
  15603. +
  15604. + meta_tp->rcv_wnd = tp->rcv_wnd;
  15605. + meta_tp->rcv_wup = meta_tp->rcv_nxt;
  15606. +
  15607. + return new_win;
  15608. +}
  15609. +/* Write every MPTCP option flagged in @opts->mptcp_options at @ptr, advancing @ptr by each option's aligned length (in 32-bit words). */
  15610. +void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
  15611. + struct tcp_out_options *opts,
  15612. + struct sk_buff *skb)
  15613. +{
  15614. + if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) {
  15615. + struct mp_capable *mpc = (struct mp_capable *)ptr;
  15616. +
  15617. + mpc->kind = TCPOPT_MPTCP;
  15618. +
  15619. + if ((OPTION_TYPE_SYN & opts->mptcp_options) ||
  15620. + (OPTION_TYPE_SYNACK & opts->mptcp_options)) {
  15621. + mpc->sender_key = opts->mp_capable.sender_key;
  15622. + mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN;
  15623. + ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2;
  15624. + } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
  15625. + mpc->sender_key = opts->mp_capable.sender_key;
  15626. + mpc->receiver_key = opts->mp_capable.receiver_key;
  15627. + mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK;
  15628. + ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2;
  15629. + }
  15630. + /* Fields written for every MP_CAPABLE variant. */
  15631. + mpc->sub = MPTCP_SUB_CAPABLE;
  15632. + mpc->ver = 0;
  15633. + mpc->a = opts->dss_csum;
  15634. + mpc->b = 0;
  15635. + mpc->rsv = 0;
  15636. + mpc->h = 1;
  15637. + }
  15638. +
  15639. + if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) {
  15640. + struct mp_join *mpj = (struct mp_join *)ptr;
  15641. +
  15642. + mpj->kind = TCPOPT_MPTCP;
  15643. + mpj->sub = MPTCP_SUB_JOIN;
  15644. + mpj->rsv = 0;
  15645. + mpj->addr_id = opts->addr_id;
  15646. + /* The payload depends on the handshake step: SYN, SYN/ACK or ACK. */
  15647. + if (OPTION_TYPE_SYN & opts->mptcp_options) {
  15648. + mpj->len = MPTCP_SUB_LEN_JOIN_SYN;
  15649. + mpj->u.syn.token = opts->mp_join_syns.token;
  15650. + mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce;
  15651. + mpj->b = tp->mptcp->low_prio;
  15652. + ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2;
  15653. + } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) {
  15654. + mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK;
  15655. + mpj->u.synack.mac =
  15656. + opts->mp_join_syns.sender_truncated_mac;
  15657. + mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce;
  15658. + mpj->b = tp->mptcp->low_prio;
  15659. + ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2;
  15660. + } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
  15661. + mpj->len = MPTCP_SUB_LEN_JOIN_ACK;
  15662. + memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20);
  15663. + ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2;
  15664. + }
  15665. + }
  15666. + if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) {
  15667. + struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
  15668. +
  15669. + mpadd->kind = TCPOPT_MPTCP;
  15670. + if (opts->add_addr_v4) {
  15671. + mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4;
  15672. + mpadd->sub = MPTCP_SUB_ADD_ADDR;
  15673. + mpadd->ipver = 4;
  15674. + mpadd->addr_id = opts->add_addr4.addr_id;
  15675. + mpadd->u.v4.addr = opts->add_addr4.addr;
  15676. + ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2;
  15677. + } else if (opts->add_addr_v6) {
  15678. + mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6;
  15679. + mpadd->sub = MPTCP_SUB_ADD_ADDR;
  15680. + mpadd->ipver = 6;
  15681. + mpadd->addr_id = opts->add_addr6.addr_id;
  15682. + memcpy(&mpadd->u.v6.addr, &opts->add_addr6.addr,
  15683. + sizeof(mpadd->u.v6.addr));
  15684. + ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2;
  15685. + }
  15686. + }
  15687. + if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) {
  15688. + struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
  15689. + u8 *addrs_id;
  15690. + int id, len, len_align;
  15691. +
  15692. + len = mptcp_sub_len_remove_addr(opts->remove_addrs);
  15693. + len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs);
  15694. +
  15695. + mprem->kind = TCPOPT_MPTCP;
  15696. + mprem->len = len;
  15697. + mprem->sub = MPTCP_SUB_REMOVE_ADDR;
  15698. + mprem->rsv = 0;
  15699. + addrs_id = &mprem->addrs_id;
  15700. + /* One address-id byte per bit set in remove_addrs. */
  15701. + mptcp_for_each_bit_set(opts->remove_addrs, id)
  15702. + *(addrs_id++) = id;
  15703. +
  15704. + /* Fill the rest with NOP's */
  15705. + if (len_align > len) {
  15706. + int i;
  15707. + for (i = 0; i < len_align - len; i++)
  15708. + *(addrs_id++) = TCPOPT_NOP;
  15709. + }
  15710. +
  15711. + ptr += len_align >> 2;
  15712. + }
  15713. + if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) {
  15714. + struct mp_fail *mpfail = (struct mp_fail *)ptr;
  15715. +
  15716. + mpfail->kind = TCPOPT_MPTCP;
  15717. + mpfail->len = MPTCP_SUB_LEN_FAIL;
  15718. + mpfail->sub = MPTCP_SUB_FAIL;
  15719. + mpfail->rsv1 = 0;
  15720. + mpfail->rsv2 = 0;
  15721. + mpfail->data_seq = htonll(((u64)opts->data_ack << 32) | opts->data_seq);
  15722. +
  15723. + ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2;
  15724. + }
  15725. + if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) {
  15726. + struct mp_fclose *mpfclose = (struct mp_fclose *)ptr;
  15727. +
  15728. + mpfclose->kind = TCPOPT_MPTCP;
  15729. + mpfclose->len = MPTCP_SUB_LEN_FCLOSE;
  15730. + mpfclose->sub = MPTCP_SUB_FCLOSE;
  15731. + mpfclose->rsv1 = 0;
  15732. + mpfclose->rsv2 = 0;
  15733. + mpfclose->key = opts->mp_capable.receiver_key;
  15734. +
  15735. + ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2;
  15736. + }
  15737. + /* DSS: write a data-ack-only option for skbs without data-seq, otherwise only patch the data_ack field located via the skb's TCP header. */
  15738. + if (OPTION_DATA_ACK & opts->mptcp_options) {
  15739. + if (!mptcp_is_data_seq(skb)) {
  15740. + struct mp_dss *mdss = (struct mp_dss *)ptr;
  15741. +
  15742. + mdss->kind = TCPOPT_MPTCP;
  15743. + mdss->sub = MPTCP_SUB_DSS;
  15744. + mdss->rsv1 = 0;
  15745. + mdss->rsv2 = 0;
  15746. + mdss->F = 0;
  15747. + mdss->m = 0;
  15748. + mdss->M = 0;
  15749. + mdss->a = 0;
  15750. + mdss->A = 1;
  15751. + mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum);
  15752. + /* Step over the 32-bit word holding the mp_dss header, then store the data-ack in network byte order. */
  15753. + ptr++;
  15754. + *ptr++ = htonl(opts->data_ack);
  15755. + } else {
  15756. + /**** Just update the data_ack ****/
  15757. +
  15758. + /* Get pointer to data_ack-field. MPTCP is always at
  15759. + * the end of the TCP-options.
  15760. + */
  15761. + /* TODO if we allow sending 64-bit dseq's we have to change "16" */
  15762. + __be32 *dack = (__be32 *)(skb->data + (tcp_hdr(skb)->doff << 2) - 16);
  15763. +
  15764. + *dack = htonl(opts->data_ack);
  15765. + }
  15766. + }
  15767. + if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) {
  15768. + struct mp_prio *mpprio = (struct mp_prio *)ptr;
  15769. +
  15770. + mpprio->kind = TCPOPT_MPTCP;
  15771. + mpprio->len = MPTCP_SUB_LEN_PRIO;
  15772. + mpprio->sub = MPTCP_SUB_PRIO;
  15773. + mpprio->rsv = 0;
  15774. + mpprio->b = tp->mptcp->low_prio;
  15775. + mpprio->addr_id = TCPOPT_NOP;
  15776. +
  15777. + ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2;
  15778. + }
  15779. +}
  15780. +
  15781. +/* Returns the next segment to be sent from the mptcp meta-queue
  15782. + * (chooses the reinject queue if any segment is waiting in it, otherwise
  15783. + * the normal write queue; in fallback mode always the meta send-head).
  15784. + * Sets *@reinject to 1 if the returned segment comes from the
  15785. + * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
  15786. + * and sets it to -1 if it is a meta-level retransmission to optimize the
  15787. + * receive-buffer. @reinject may be NULL, in which case nothing is stored.
  15788. + */
  15789. +struct sk_buff *mptcp_next_segment(struct sock *meta_sk, int *reinject)
  15790. +{
  15791. + struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
  15792. + struct sk_buff *skb = NULL;
  15793. + if (reinject)
  15794. + *reinject = 0;
  15795. +
  15796. + /* If we are in fallback-mode, just take from the meta-send-queue */
  15797. + if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
  15798. + return tcp_send_head(meta_sk);
  15799. +
  15800. + skb = skb_peek(&mpcb->reinject_queue);
  15801. +
  15802. + if (skb) {
  15803. + if (reinject)
  15804. + *reinject = 1;
  15805. + } else {
  15806. + skb = tcp_send_head(meta_sk);
  15807. + /* No send head, SOCK_NOSPACE is set and write space is below the minimum: look for a meta-level retransmission instead. */
  15808. + if (!skb && meta_sk->sk_socket &&
  15809. + test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) &&
  15810. + sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) {
  15811. + struct sock *subsk = get_available_subflow(meta_sk, NULL, NULL);
  15812. + if (!subsk)
  15813. + return NULL;
  15814. +
  15815. + skb = mptcp_rcv_buf_optimization(subsk, 0);
  15816. + if (skb && reinject)
  15817. + *reinject = -1;
  15818. + }
  15819. + }
  15820. + return skb;
  15821. +}
  15822. +
  15823. +/* Sends the datafin: flagged on the tail skb if a send head exists, otherwise queued as a newly allocated non-data skb; then pushes pending frames. */
  15824. +void mptcp_send_fin(struct sock *meta_sk)
  15825. +{
  15826. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  15827. + struct sk_buff *skb = tcp_write_queue_tail(meta_sk);
  15828. + int mss_now;
  15829. + /* Sending the FIN from CLOSE_WAIT or LAST_ACK is recorded as a passive close. */
  15830. + if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
  15831. + meta_tp->mpcb->passive_close = 1;
  15832. +
  15833. + /* Optimization, tack on the FIN if we have a queue of
  15834. + * unsent frames. But be careful about outgoing SACKS
  15835. + * and IP options.
  15836. + */
  15837. + mss_now = mptcp_current_mss(meta_sk);
  15838. + /* The DATA_FIN occupies one sequence number, hence the end_seq/write_seq increments. */
  15839. + if (tcp_send_head(meta_sk) != NULL) {
  15840. + TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN;
  15841. + TCP_SKB_CB(skb)->end_seq++;
  15842. + meta_tp->write_seq++;
  15843. + } else {
  15844. + /* Socket is locked, keep trying until memory is available. */
  15845. + for (;;) {
  15846. + skb = alloc_skb_fclone(MAX_TCP_HEADER,
  15847. + meta_sk->sk_allocation);
  15848. + if (skb)
  15849. + break;
  15850. + yield();
  15851. + }
  15852. + /* Reserve space for headers and prepare control bits. */
  15853. + skb_reserve(skb, MAX_TCP_HEADER);
  15854. +
  15855. + tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK);
  15856. + TCP_SKB_CB(skb)->end_seq++;
  15857. + TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN | MPTCPHDR_SEQ;
  15858. + tcp_queue_skb(meta_sk, skb);
  15859. + }
  15860. + __tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF);
  15861. +}
  15862. +/* Abort the MPTCP connection: flag one subflow for MP_FCLOSE and send an ACK on it, reset and force-close all others (in infinite-mapping mode: just reset the selected subflow). */
  15863. +void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority)
  15864. +{
  15865. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  15866. + struct mptcp_cb *mpcb = meta_tp->mpcb;
  15867. + struct sock *sk = NULL, *sk_it = NULL, *tmpsk;
  15868. +
  15869. + if (!mpcb->cnt_subflows)
  15870. + return;
  15871. +
  15872. + WARN_ON(meta_tp->send_mp_fclose);
  15873. +
  15874. + /* First - select a socket */
  15875. + sk = mptcp_select_ack_sock(meta_sk, 0);
  15876. +
  15877. + /* May happen if no subflow is in an appropriate state */
  15878. + if (!sk)
  15879. + return;
  15880. +
  15881. + /* We are in infinite mode - just send a reset */
  15882. + if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv) {
  15883. + sk->sk_err = ECONNRESET;
  15884. + if (tcp_need_reset(sk->sk_state))
  15885. + tcp_send_active_reset(sk, priority);
  15886. + mptcp_sub_force_close(sk);
  15887. + return;
  15888. + }
  15889. +
  15890. + /* Flag the selected subflow: mptcp_established_options() emits MP_FCLOSE while send_mp_fclose is set. */
  15891. + tcp_sk(sk)->send_mp_fclose = 1;
  15892. + /** Reset all other subflows */
  15893. +
  15894. + /* tcp_done must be handled with bh disabled */
  15895. + if (!in_serving_softirq())
  15896. + local_bh_disable();
  15897. +
  15898. + mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
  15899. + if (tcp_sk(sk_it)->send_mp_fclose)
  15900. + continue;
  15901. +
  15902. + sk_it->sk_err = ECONNRESET;
  15903. + if (tcp_need_reset(sk_it->sk_state))
  15904. + tcp_send_active_reset(sk_it, GFP_ATOMIC);
  15905. + mptcp_sub_force_close(sk_it);
  15906. + }
  15907. +
  15908. + if (!in_serving_softirq())
  15909. + local_bh_enable();
  15910. + /* Send the ACK on the selected subflow and reset its keepalive timer to the current RTO. */
  15911. + tcp_send_ack(sk);
  15912. + inet_csk_reset_keepalive_timer(sk, inet_csk(sk)->icsk_rto);
  15913. +
  15914. + meta_tp->send_mp_fclose = 1;
  15915. +}
  15916. +/* Retransmit a pure ACK on @sk and re-arm mptcp_ack_timer with a doubled RTO; stops the timer and resets the subflow when tcp_write_timeout() fires. */
  15917. +static void mptcp_ack_retransmit_timer(struct sock *sk)
  15918. +{
  15919. + struct sk_buff *skb;
  15920. + struct tcp_sock *tp = tcp_sk(sk);
  15921. + struct inet_connection_sock *icsk = inet_csk(sk);
  15922. +
  15923. + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
  15924. + goto out; /* Routing failure or similar */
  15925. + /* Stamp the first retransmission; never store 0 since 0 means "unset" in the test below. */
  15926. + if (!tp->retrans_stamp)
  15927. + tp->retrans_stamp = tcp_time_stamp ? : 1;
  15928. + /* tcp_write_timeout() says give up: leave pre_established, stop the timer and reset the subflow. */
  15929. + if (tcp_write_timeout(sk)) {
  15930. + tp->mptcp->pre_established = 0;
  15931. + sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
  15932. + tcp_send_active_reset(sk, GFP_ATOMIC);
  15933. + goto out;
  15934. + }
  15935. + /* On allocation failure just re-arm the timer for one RTO without backing off. */
  15936. + skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
  15937. + if (skb == NULL) {
  15938. + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
  15939. + jiffies + icsk->icsk_rto);
  15940. + return;
  15941. + }
  15942. +
  15943. + /* Reserve space for headers and prepare control bits */
  15944. + skb_reserve(skb, MAX_TCP_HEADER);
  15945. + tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK);
  15946. +
  15947. + TCP_SKB_CB(skb)->when = tcp_time_stamp;
  15948. + if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) {
  15949. + /* Retransmission failed because of local congestion,
  15950. + * do not backoff.
  15951. + */
  15952. + if (!icsk->icsk_retransmits)
  15953. + icsk->icsk_retransmits = 1;
  15954. + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
  15955. + jiffies + icsk->icsk_rto);
  15956. + return;
  15957. + }
  15958. +
  15959. + /* Otherwise count the retransmission and double the RTO (capped at TCP_RTO_MAX) before re-arming. */
  15960. + icsk->icsk_retransmits++;
  15961. + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
  15962. + sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
  15963. + jiffies + icsk->icsk_rto);
  15964. + if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) {
  15965. + __sk_dst_reset(sk);
  15966. + }
  15967. +
  15968. +out:;
  15969. +}
  15970. +/* Handler for a subflow's mptcp_ack_timer; @data is the subflow sock. Drops one sock reference on exit. */
  15971. +void mptcp_ack_handler(unsigned long data)
  15972. +{
  15973. + struct sock *sk = (struct sock *)data;
  15974. + struct sock *meta_sk = mptcp_meta_sk(sk);
  15975. +
  15976. + bh_lock_sock(meta_sk);
  15977. + if (sock_owned_by_user(meta_sk)) {
  15978. + /* Meta-socket is owned by the user: try again in HZ / 20 jiffies */
  15979. + sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer,
  15980. + jiffies + (HZ / 20));
  15981. + goto out_unlock;
  15982. + }
  15983. +
  15984. + if (sk->sk_state == TCP_CLOSE)
  15985. + goto out_unlock;
  15986. +
  15987. + mptcp_ack_retransmit_timer(sk);
  15988. +
  15989. + sk_mem_reclaim(sk);
  15990. +
  15991. +out_unlock:
  15992. + bh_unlock_sock(meta_sk);
  15993. + sock_put(sk);
  15994. +}
  15995. +
  15996. +/* Similar to tcp_retransmit_skb: retransmit @skb of the meta-socket on one subflow.
  15997. + *
  15998. + * The diff is that we handle the retransmission-stats (retrans_stamp) at the
  15999. + * meta-level. Returns 0 on success or when no subflow is available, else non-zero.
  16000. + */
  16001. +int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb)
  16002. +{
  16003. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  16004. + struct sock *subsk;
  16005. + struct sk_buff *subskb;
  16006. + unsigned int limit, tso_segs, mss_now;
  16007. + int err = -1, oldpcount;
  16008. +
  16009. + /* Do not send more than we queued. 1/4 is reserved for possible
  16010. + * copying overhead: fragmentation, tunneling, mangling etc.
  16011. + *
  16012. + * This is a meta-retransmission thus we check on the meta-socket.
  16013. + */
  16014. + if (atomic_read(&meta_sk->sk_wmem_alloc) >
  16015. + min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) {
  16016. + return -EAGAIN;
  16017. + }
  16018. +
  16019. + /* We need to make sure that the retransmitted segment can be sent on a
  16020. + * subflow right now. If it is too big, it needs to be fragmented.
  16021. + */
  16022. + subsk = get_available_subflow(meta_sk, skb, &mss_now);
  16023. + if (!subsk) {
  16024. + /* We want to increase icsk_retransmits, thus return 0, so that
  16025. + * mptcp_retransmit_timer enters the desired branch.
  16026. + */
  16027. + err = 0;
  16028. + goto failed;
  16029. + }
  16030. +
  16031. + /* If the segment was cloned (e.g. a meta retransmission), the header
  16032. + * must be expanded/copied so that there is no corruption of TSO
  16033. + * information.
  16034. + */
  16035. + if (skb_unclone(skb, GFP_ATOMIC)) {
  16036. + err = ENOMEM; /* NOTE(review): positive, so mptcp_retransmit_timer's "err > 0" skips backoff - confirm intended */
  16037. + goto failed;
  16038. + }
  16039. +
  16040. + oldpcount = tcp_skb_pcount(skb);
  16041. + tcp_set_skb_tso_segs(meta_sk, skb, mss_now);
  16042. + tso_segs = tcp_skb_pcount(skb);
  16043. + BUG_ON(!tso_segs);
  16044. +
  16045. + /* The MSS might have changed and so the number of segments. We
  16046. + * need to account for this change.
  16047. + */
  16048. + if (unlikely(oldpcount != tso_segs))
  16049. + tcp_adjust_pcount(meta_sk, skb, oldpcount - tso_segs);
  16050. +
  16051. + limit = mss_now;
  16052. + if (tso_segs > 1 && !tcp_urg_mode(meta_tp))
  16053. + limit = tcp_mss_split_point(subsk, skb, mss_now,
  16054. + min_t(unsigned int,
  16055. + tcp_cwnd_test(tcp_sk(subsk), skb),
  16056. + subsk->sk_gso_max_segs),
  16057. + TCP_NAGLE_OFF);
  16058. +
  16059. + if (skb->len > limit &&
  16060. + unlikely(mptso_fragment(meta_sk, skb, limit, mss_now,
  16061. + GFP_ATOMIC, 0)))
  16062. + goto failed; /* err is still -1 here */
  16063. +
  16064. + subskb = mptcp_skb_entail(subsk, skb, -1);
  16065. + if (!subskb)
  16066. + goto failed; /* err is still -1 here */
  16067. +
  16068. + TCP_SKB_CB(skb)->when = tcp_time_stamp;
  16069. + TCP_SKB_CB(subskb)->when = tcp_time_stamp;
  16070. + err = tcp_transmit_skb(subsk, subskb, 1, GFP_ATOMIC);
  16071. + if (!err) {
  16072. + /* Update global TCP statistics. */
  16073. + TCP_INC_STATS(sock_net(meta_sk), TCP_MIB_RETRANSSEGS);
  16074. +
  16075. + /* Diff to tcp_retransmit_skb */
  16076. +
  16077. + /* Save stamp of the first retransmit. */
  16078. + if (!meta_tp->retrans_stamp)
  16079. + meta_tp->retrans_stamp = TCP_SKB_CB(subskb)->when;
  16080. + mptcp_sub_event_new_data_sent(subsk, subskb, skb);
  16081. + } else {
  16082. + mptcp_transmit_skb_failed(subsk, skb, subskb);
  16083. + }
  16084. +
  16085. +failed:
  16086. + return err;
  16087. +}
  16088. +
  16089. +/* Similar to tcp_retransmit_timer
  16090. + * (retransmits the head of the meta write-queue and re-arms ICSK_TIME_RETRANS on @meta_sk).
  16091. + * The diff is that we have to handle retransmissions of the FAST_CLOSE-message
  16092. + * and that we don't have an srtt estimation at the meta-level.
  16093. + */
  16094. +void mptcp_retransmit_timer(struct sock *meta_sk)
  16095. +{
  16096. + struct tcp_sock *meta_tp = tcp_sk(meta_sk);
  16097. + struct mptcp_cb *mpcb = meta_tp->mpcb;
  16098. + struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
  16099. + int err;
  16100. +
  16101. + /* In fallback, retransmission is handled at the subflow-level */
  16102. + if (!meta_tp->packets_out || mpcb->infinite_mapping_snd ||
  16103. + mpcb->send_infinite_mapping)
  16104. + return;
  16105. +
  16106. + WARN_ON(tcp_write_queue_empty(meta_sk));
  16107. +
  16108. + if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) &&
  16109. + !((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
  16110. + /* Receiver dastardly shrinks window. Our retransmits
  16111. + * become zero probes, but we should not timeout this
  16112. + * connection. If the socket is an orphan, time it out,
  16113. + * we cannot allow such beasts to hang infinitely.
  16114. + */
  16115. + struct inet_sock *meta_inet = inet_sk(meta_sk);
  16116. + if (meta_sk->sk_family == AF_INET) {
  16117. + LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
  16118. + &meta_inet->inet_daddr,
  16119. + ntohs(meta_inet->inet_dport),
  16120. + meta_inet->inet_num, meta_tp->snd_una,
  16121. + meta_tp->snd_nxt);
  16122. + }
  16123. +#if IS_ENABLED(CONFIG_IPV6)
  16124. + else if (meta_sk->sk_family == AF_INET6) {
  16125. + LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
  16126. + &meta_sk->sk_v6_daddr,
  16127. + ntohs(meta_inet->inet_dport),
  16128. + meta_inet->inet_num, meta_tp->snd_una,
  16129. + meta_tp->snd_nxt);
  16130. + }
  16131. +#endif
  16132. + if (tcp_time_stamp - meta_tp->rcv_tstamp > TCP_RTO_MAX) {
  16133. + tcp_write_err(meta_sk);
  16134. + return;
  16135. + }
  16136. +
  16137. + mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk)); /* result ignored on this probe path */
  16138. + goto out_reset_timer;
  16139. + }
  16140. +
  16141. + if (tcp_write_timeout(meta_sk))
  16142. + return;
  16143. +
  16144. + if (meta_icsk->icsk_retransmits == 0)
  16145. + NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS);
  16146. +
  16147. + meta_icsk->icsk_ca_state = TCP_CA_Loss;
  16148. +
  16149. + err = mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
  16150. + if (err > 0) {
  16151. + /* Retransmission failed because of local congestion,
  16152. + * do not backoff.
  16153. + */
  16154. + if (!meta_icsk->icsk_retransmits)
  16155. + meta_icsk->icsk_retransmits = 1;
  16156. + inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
  16157. + min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
  16158. + TCP_RTO_MAX);
  16159. + return;
  16160. + }
  16161. +
  16162. + /* Increase the timeout each time we retransmit. Note that
  16163. + * we do not increase the rtt estimate. rto is initialized
  16164. + * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
  16165. + * that doubling rto each time is the least we can get away with.
  16166. + * In KA9Q, Karn uses this for the first few times, and then
  16167. + * goes to quadratic. netBSD doubles, but only goes up to *64,
  16168. + * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
  16169. + * defined in the protocol as the maximum possible RTT. I guess
  16170. + * we'll have to use something other than TCP to talk to the
  16171. + * University of Mars.
  16172. + *
  16173. + * PAWS allows us longer timeouts and large windows, so once
  16174. + * implemented ftp to mars will work nicely. We will have to fix
  16175. + * the 120 second clamps though!
  16176. + */
  16177. + meta_icsk->icsk_backoff++;
  16178. + meta_icsk->icsk_retransmits++;
  16179. +
  16180. +out_reset_timer:
  16181. + /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
  16182. + * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
  16183. + * might be increased if the stream oscillates between thin and thick,
  16184. + * thus the old value might already be too high compared to the value
  16185. + * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
  16186. + * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
  16187. + * exponential backoff behaviour to avoid continuously hammering
  16188. + * linear-timeout retransmissions into a black hole
  16189. + */
  16190. + if (meta_sk->sk_state == TCP_ESTABLISHED &&
  16191. + (meta_tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
  16192. + tcp_stream_is_thin(meta_tp) &&
  16193. + meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
  16194. + meta_icsk->icsk_backoff = 0;
  16195. + /* We cannot do the same as in tcp_write_timer because the
  16196. + * srtt is not set here.
  16197. + */
  16198. + mptcp_set_rto(meta_sk);
  16199. + } else {
  16200. + /* Use normal (exponential) backoff */
  16201. + meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX);
  16202. + }
  16203. + inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX);
  16204. +
  16205. + return;
  16206. +}
  16207. +
  16208. +/* Modify values to an mptcp-level for the initial window of new subflows, then defer to TCP */
  16209. +void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
  16210. + __u32 *window_clamp, int wscale_ok,
  16211. + __u8 *rcv_wscale, __u32 init_rcv_wnd,
  16212. + const struct sock *sk)
  16213. +{
  16214. + struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
  16215. +
  16216. + *window_clamp = mpcb->orig_window_clamp;
  16217. + __space = tcp_win_from_space(mpcb->orig_sk_rcvbuf); /* the caller-supplied __space is discarded */
  16218. +
  16219. + tcp_select_initial_window(__space, mss, rcv_wnd, window_clamp,
  16220. + wscale_ok, rcv_wscale, init_rcv_wnd, sk);
  16221. +}
  16222. +/* Return the largest tcp_current_mss() among the subflows of @meta_sk that can send. */
  16223. +unsigned int mptcp_current_mss(struct sock *meta_sk)
  16224. +{
  16225. + unsigned int mss = 0;
  16226. + struct sock *sk;
  16227. +
  16228. + mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
  16229. + int this_mss;
  16230. +
  16231. + if (!mptcp_sk_can_send(sk))
  16232. + continue;
  16233. +
  16234. + this_mss = tcp_current_mss(sk);
  16235. + if (this_mss > mss)
  16236. + mss = this_mss;
  16237. + }
  16238. +
  16239. + /* If no subflow is available, we take a default-mss from the
  16240. + * meta-socket.
  16241. + */
  16242. + return !mss ? tcp_current_mss(meta_sk) : mss;
  16243. +}
  16244. +/* Return a size based on the largest mss_cache of the sendable subflows; @sg may override or clamp it. */
  16245. +int mptcp_select_size(const struct sock *meta_sk, bool sg)
  16246. +{
  16247. + int mss = 0; /* Largest mss_cache seen: the loop below keeps the maximum */
  16248. + struct sock *sk;
  16249. +
  16250. + mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
  16251. + int this_mss;
  16252. +
  16253. + if (!mptcp_sk_can_send(sk))
  16254. + continue;
  16255. +
  16256. + this_mss = tcp_sk(sk)->mss_cache;
  16257. + if (this_mss > mss)
  16258. + mss = this_mss;
  16259. + }
  16260. +
  16261. + if (sg) {
  16262. + if (mptcp_sk_can_gso(meta_sk)) {
  16263. + mss = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); /* fixed size, subflow MSS ignored */
  16264. + } else {
  16265. + int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
  16266. +
  16267. + if (mss >= pgbreak &&
  16268. + mss <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
  16269. + mss = pgbreak;
  16270. + }
  16271. + }
  16272. +
  16273. + return !mss ? tcp_sk(meta_sk)->mss_cache : mss; /* no sendable subflow: use the meta mss_cache */
  16274. +}
  16275. +/* Scale tp->snd_cwnd by (largest srtt of the sendable subflows / tp->srtt); never below reordering + 1. */
  16276. +int mptcp_check_snd_buf(const struct tcp_sock *tp)
  16277. +{
  16278. + struct sock *sk;
  16279. + u32 rtt_max = tp->srtt;
  16280. + u64 bw_est;
  16281. +
  16282. + if (!tp->srtt) /* no RTT estimate yet; also avoids dividing by zero below */
  16283. + return tp->reordering + 1;
  16284. +
  16285. + mptcp_for_each_sk(tp->mpcb, sk) {
  16286. + if (!mptcp_sk_can_send(sk))
  16287. + continue;
  16288. +
  16289. + if (rtt_max < tcp_sk(sk)->srtt)
  16290. + rtt_max = tcp_sk(sk)->srtt;
  16291. + }
  16292. +
  16293. + bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16,
  16294. + (u64)tp->srtt);
  16295. +
  16296. + return max_t(unsigned int, (u32)(bw_est >> 16),
  16297. + tp->reordering + 1);
  16298. +
  16299. +}
  16300. +/* Largest tcp_xmit_size_goal() over the sendable subflows, but never less than @mss_now. */
  16301. +unsigned int mptcp_xmit_size_goal(struct sock *meta_sk, u32 mss_now,
  16302. + int large_allowed)
  16303. +{
  16304. + struct sock *sk;
  16305. + u32 xmit_size_goal = 0;
  16306. +
  16307. + if (large_allowed && mptcp_sk_can_gso(meta_sk)) {
  16308. + mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
  16309. + int this_size_goal;
  16310. +
  16311. + if (!mptcp_sk_can_send(sk))
  16312. + continue;
  16313. +
  16314. + this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1);
  16315. + if (this_size_goal > xmit_size_goal)
  16316. + xmit_size_goal = this_size_goal;
  16317. + }
  16318. + }
  16319. +
  16320. + return max(xmit_size_goal, mss_now);
  16321. +}
  16322. +
  16323. +/* Similar to tcp_trim_head - but we correctly copy the DSS-option. Returns 0 or -ENOMEM. */
  16324. +int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
  16325. +{
  16326. + int dsslen = MPTCP_SUB_LEN_DSS_ALIGN + MPTCP_SUB_LEN_ACK_ALIGN +
  16327. + MPTCP_SUB_LEN_SEQ_ALIGN;
  16328. + char dss[dsslen];
  16329. +
  16330. + /* DSS-option must be recovered afterwards; it is read from the dsslen bytes in front of skb->data. */
  16331. + memcpy(dss, skb->data - dsslen, dsslen);
  16332. +
  16333. + if (skb_cloned(skb)) {
  16334. + /* pskb_expand_head will delete our DSS-option. We have to copy
  16335. + * it back if pskb_expand_head succeeds.
  16336. + */
  16337. +
  16338. + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
  16339. + return -ENOMEM;
  16340. +
  16341. + memcpy(skb->data - dsslen, dss, dsslen);
  16342. + }
  16343. +
  16344. + __pskb_trim_head(skb, len);
  16345. +
  16346. + /* Put the DSS-option back in our header */
  16347. + memcpy(skb->data - dsslen, dss, dsslen);
  16348. +
  16349. + TCP_SKB_CB(skb)->seq += len;
  16350. + skb->ip_summed = CHECKSUM_PARTIAL;
  16351. +
  16352. + skb->truesize -= len; /* keep the memory accounting in step with the trimmed bytes */
  16353. + sk->sk_wmem_queued -= len;
  16354. + sk_mem_uncharge(sk, len);
  16355. + sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
  16356. +
  16357. + /* Any change of skb->len requires recalculation of tso factor. */
  16358. + if (tcp_skb_pcount(skb) > 1)
  16359. + tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
  16360. +
  16361. + return 0;
  16362. +}
  16363. diff --git a/net/mptcp/mptcp_pm.c b/net/mptcp/mptcp_pm.c
  16364. new file mode 100644
  16365. index 0000000..1f78ae1
  16366. --- /dev/null
  16367. +++ b/net/mptcp/mptcp_pm.c
  16368. @@ -0,0 +1,170 @@
  16369. +/*
  16370. + * MPTCP implementation - MPTCP-subflow-management
  16371. + *
  16372. + * Initial Design & Implementation:
  16373. + * Sébastien Barré <sebastien.barre@uclouvain.be>
  16374. + *
  16375. + * Current Maintainer & Author:
  16376. + * Christoph Paasch <christoph.paasch@uclouvain.be>
  16377. + *
  16378. + * Additional authors:
  16379. + * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
  16380. + * Gregory Detal <gregory.detal@uclouvain.be>
  16381. + * Fabien Duchêne <fabien.duchene@uclouvain.be>
  16382. + * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
  16383. + * Lavkesh Lahngir <lavkesh51@gmail.com>
  16384. + * Andreas Ripke <ripke@neclab.eu>
  16385. + * Vlad Dogaru <vlad.dogaru@intel.com>
  16386. + * Octavian Purdila <octavian.purdila@intel.com>
  16387. + * John Ronan <jronan@tssg.org>
  16388. + * Catalin Nicutar <catalin.nicutar@gmail.com>
  16389. + * Brandon Heller <brandonh@stanford.edu>
  16390. + *
  16391. + *
  16392. + * This program is free software; you can redistribute it and/or
  16393. + * modify it under the terms of the GNU General Public License
  16394. + * as published by the Free Software Foundation; either version
  16395. + * 2 of the License, or (at your option) any later version.
  16396. + */
  16397. +
  16398. +
  16399. +#include <linux/module.h>
  16400. +#include <net/mptcp.h>
  16401. +/* Registered path managers; writers take mptcp_pm_list_lock and the first entry is the default. */
  16402. +static DEFINE_SPINLOCK(mptcp_pm_list_lock);
  16403. +static LIST_HEAD(mptcp_pm_list);
  16404. +/* Callback of the default path manager: ignores its arguments and always returns 0. */
  16405. +static int mptcp_default_index(sa_family_t family, union inet_addr *addr,
  16406. + struct net *net)
  16407. +{
  16408. + return 0;
  16409. +}
  16410. +/* Built-in path manager named "default"; both callbacks point at mptcp_default_index. */
  16411. +struct mptcp_pm_ops mptcp_pm_default = {
  16412. + .get_local_index = mptcp_default_index,
  16413. + .get_local_id = mptcp_default_index, /* We do not care */
  16414. + .name = "default",
  16415. + .owner = THIS_MODULE,
  16416. +};
  16417. +/* Look up a registered path manager by exact name; returns NULL if there is none. */
  16418. +static struct mptcp_pm_ops *mptcp_pm_find(const char *name)
  16419. +{
  16420. + struct mptcp_pm_ops *e;
  16421. +
  16422. + list_for_each_entry_rcu(e, &mptcp_pm_list, list) {
  16423. + if (strcmp(e->name, name) == 0)
  16424. + return e;
  16425. + }
  16426. +
  16427. + return NULL;
  16428. +}
  16429. +/* Append @pm to the list. Returns 0, -EINVAL if a callback is missing, or -EEXIST on a duplicate name. */
  16430. +int mptcp_register_path_manager(struct mptcp_pm_ops *pm)
  16431. +{
  16432. + int ret = 0;
  16433. +
  16434. + if (!pm->get_local_index || !pm->get_local_id)
  16435. + return -EINVAL;
  16436. +
  16437. + spin_lock(&mptcp_pm_list_lock);
  16438. + if (mptcp_pm_find(pm->name)) {
  16439. + pr_notice("%s already registered\n", pm->name);
  16440. + ret = -EEXIST;
  16441. + } else {
  16442. + list_add_tail_rcu(&pm->list, &mptcp_pm_list);
  16443. + pr_info("%s registered\n", pm->name);
  16444. + }
  16445. + spin_unlock(&mptcp_pm_list_lock);
  16446. +
  16447. + return ret;
  16448. +}
  16449. +EXPORT_SYMBOL_GPL(mptcp_register_path_manager);
  16450. +/* Remove @pm from the list of registered path managers. */
  16451. +void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm)
  16452. +{
  16453. + spin_lock(&mptcp_pm_list_lock);
  16454. + list_del_rcu(&pm->list);
  16455. + spin_unlock(&mptcp_pm_list_lock);
  16456. +}
  16457. +EXPORT_SYMBOL_GPL(mptcp_unregister_path_manager);
  16458. +/* Copy the name of the first list entry (the current default) into @name. */
  16459. +void mptcp_get_default_path_manager(char *name)
  16460. +{
  16461. + struct mptcp_pm_ops *pm;
  16462. +
  16463. + BUG_ON(list_empty(&mptcp_pm_list));
  16464. +
  16465. + rcu_read_lock();
  16466. + pm = list_entry(mptcp_pm_list.next, struct mptcp_pm_ops, list);
  16467. + strncpy(name, pm->name, MPTCP_PM_NAME_MAX); /* not NUL-terminated if pm->name fills all MPTCP_PM_NAME_MAX bytes */
  16468. + rcu_read_unlock();
  16469. +}
  16470. +/* Make the path manager called @name the default by moving it to the list head. Returns 0 or -ENOENT. */
  16471. +int mptcp_set_default_path_manager(const char *name)
  16472. +{
  16473. + struct mptcp_pm_ops *pm;
  16474. + int ret = -ENOENT;
  16475. +
  16476. + spin_lock(&mptcp_pm_list_lock);
  16477. + pm = mptcp_pm_find(name);
  16478. +#ifdef CONFIG_MODULES
  16479. + if (!pm && capable(CAP_NET_ADMIN)) {
  16480. + spin_unlock(&mptcp_pm_list_lock);
  16481. + /* Lock is dropped around the module load (presumably because it can sleep), then the lookup is retried. */
  16482. + request_module("mptcp_%s", name);
  16483. + spin_lock(&mptcp_pm_list_lock);
  16484. + pm = mptcp_pm_find(name);
  16485. + }
  16486. +#endif
  16487. +
  16488. + if (pm) {
  16489. + list_move(&pm->list, &mptcp_pm_list);
  16490. + ret = 0;
  16491. + } else {
  16492. + pr_info("%s is not available\n", name);
  16493. + }
  16494. + spin_unlock(&mptcp_pm_list_lock);
  16495. +
  16496. + return ret;
  16497. +}
  16498. +/* Attach to @mpcb the first listed path manager whose module reference can be taken. */
  16499. +void mptcp_init_path_manager(struct mptcp_cb *mpcb)
  16500. +{
  16501. + struct mptcp_pm_ops *pm;
  16502. +
  16503. + rcu_read_lock();
  16504. + list_for_each_entry_rcu(pm, &mptcp_pm_list, list) {
  16505. + if (try_module_get(pm->owner)) {
  16506. + mpcb->pm_ops = pm;
  16507. + break;
  16508. + }
  16509. + }
  16510. + rcu_read_unlock(); /* NOTE(review): mpcb->pm_ops is left untouched if no entry could be taken */
  16511. +}
  16512. +
  16513. +/* Manage refcounts on socket close: drop the module reference of mpcb->pm_ops. */
  16514. +void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb)
  16515. +{
  16516. + module_put(mpcb->pm_ops->owner);
  16517. +}
  16518. +
  16519. +/* Fallback to the default path-manager: release the current one and attach "default" to @mpcb. */
  16520. +void mptcp_fallback_default(struct mptcp_cb *mpcb)
  16521. +{
  16522. + struct mptcp_pm_ops *pm;
  16523. +
  16524. + mptcp_cleanup_path_manager(mpcb);
  16525. + pm = mptcp_pm_find("default");
  16526. +
  16527. + /* Cannot fail - it's the default module. NOTE(review): pm is not NULL-checked and the result is ignored. */
  16528. + try_module_get(pm->owner);
  16529. + mpcb->pm_ops = pm;
  16530. +}
  16531. +EXPORT_SYMBOL_GPL(mptcp_fallback_default);
  16532. +
  16533. +/* Set default value from kernel configuration (CONFIG_DEFAULT_MPTCP_PM) at bootup, as a late initcall */
  16534. +static int __init mptcp_path_manager_default(void)
  16535. +{
  16536. + return mptcp_set_default_path_manager(CONFIG_DEFAULT_MPTCP_PM);
  16537. +}
  16538. +late_initcall(mptcp_path_manager_default);
  16539. diff --git a/net/mptcp/mptcp_wvegas.c b/net/mptcp/mptcp_wvegas.c
  16540. new file mode 100644
  16541. index 0000000..8e1fd50
  16542. --- /dev/null
  16543. +++ b/net/mptcp/mptcp_wvegas.c
  16544. @@ -0,0 +1,270 @@
  16545. +/*
  16546. + * MPTCP implementation - WEIGHTED VEGAS
  16547. + *
  16548. + * Algorithm design:
  16549. + * Yu Cao <cyAnalyst@126.com>
  16550. + * Mingwei Xu <xmw@csnet1.cs.tsinghua.edu.cn>
  16551. + * Xiaoming Fu <fu@cs.uni-goettinggen.de>
  16552. + *
  16553. + * Implementation:
  16554. + * Yu Cao <cyAnalyst@126.com>
  16555. + * Enhuan Dong <deh13@mails.tsinghua.edu.cn>
  16556. + *
  16557. + * Ported to the official MPTCP-kernel:
  16558. + * Christoph Paasch <christoph.paasch@uclouvain.be>
  16559. + *
  16560. + * This program is free software; you can redistribute it and/or
  16561. + * modify it under the terms of the GNU General Public License
  16562. + * as published by the Free Software Foundation; either version
  16563. + * 2 of the License, or (at your option) any later version.
  16564. + */
  16565. +
  16566. +#include <linux/skbuff.h>
  16567. +#include <net/tcp.h>
  16568. +#include <net/mptcp.h>
  16569. +#include <linux/module.h>
  16570. +#include <linux/tcp.h>
  16571. +
  16572. +static int initial_alpha = 2; /* alpha assigned to a subflow by wvegas_enable() */
  16573. +static int total_alpha = 10; /* multiplied by a subflow's weight to get its alpha */
  16574. +static int gamma = 1; /* threshold on diff while cwnd <= ssthresh */
  16575. +
  16576. +module_param(initial_alpha, int, 0644);
  16577. +MODULE_PARM_DESC(initial_alpha, "initial alpha for all subflows");
  16578. +module_param(total_alpha, int, 0644);
  16579. +MODULE_PARM_DESC(total_alpha, "total alpha for all subflows");
  16580. +module_param(gamma, int, 0644);
  16581. +MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
  16582. +
  16583. +#define MPTCP_WVEGAS_SCALE 16 /* shift used for the 2^16 fixed-point values below */
  16584. +
  16585. +/* wVegas variables, kept in the per-socket congestion-control area (inet_csk_ca) */
  16586. +struct wvegas {
  16587. + u32 beg_snd_nxt; /* right edge during last RTT */
  16588. + u8 doing_wvegas_now;/* if true, do wvegas for this RTT */
  16589. +
  16590. + u16 cnt_rtt; /* # of RTTs measured within last RTT */
  16591. + u32 sampled_rtt; /* cumulative RTTs measured within last RTT (in usec) */
  16592. + u32 base_rtt; /* the min of all wVegas RTT measurements seen (in usec) */
  16593. +
  16594. + u64 instant_rate; /* cwnd / srtt_us, unit: pkts/us * 2^16 */
  16595. + u64 weight; /* the ratio of subflow's rate to the total rate, * 2^16 */
  16596. + int alpha; /* alpha for each subflows */
  16597. +
  16598. + u32 queue_delay; /* smallest (rtt - base_rtt) seen since the last reset; 0 = unset */
  16599. +};
  16600. +
  16601. +/* Fixed-point helper: widen @val to 64 bit and shift it left by @scale bits. */
  16602. +static inline u64 mptcp_wvegas_scale(u32 val, int scale)
  16603. +{
  16604. + return (u64) val << scale;
  16605. +}
  16606. +/* Turn wVegas on for @sk and reset its per-RTT samples, rate, alpha, weight and queue_delay. */
  16607. +static void wvegas_enable(struct sock *sk)
  16608. +{
  16609. + const struct tcp_sock *tp = tcp_sk(sk);
  16610. + struct wvegas *wvegas = inet_csk_ca(sk);
  16611. +
  16612. + wvegas->doing_wvegas_now = 1;
  16613. +
  16614. + wvegas->beg_snd_nxt = tp->snd_nxt;
  16615. +
  16616. + wvegas->cnt_rtt = 0;
  16617. + wvegas->sampled_rtt = 0;
  16618. +
  16619. + wvegas->instant_rate = 0;
  16620. + wvegas->alpha = initial_alpha;
  16621. + wvegas->weight = mptcp_wvegas_scale(1, MPTCP_WVEGAS_SCALE); /* 1.0 in fixed point */
  16622. +
  16623. + wvegas->queue_delay = 0;
  16624. +}
  16625. +/* Turn wVegas off for @sk; mptcp_wvegas_cong_avoid() uses Reno while the flag is clear. */
  16626. +static inline void wvegas_disable(struct sock *sk)
  16627. +{
  16628. + struct wvegas *wvegas = inet_csk_ca(sk);
  16629. +
  16630. + wvegas->doing_wvegas_now = 0;
  16631. +}
  16632. +/* .init hook: forget the minimum RTT seen so far and enable wVegas. */
  16633. +static void mptcp_wvegas_init(struct sock *sk)
  16634. +{
  16635. + struct wvegas *wvegas = inet_csk_ca(sk);
  16636. +
  16637. + wvegas->base_rtt = 0x7fffffff; /* large start value so that the first sample replaces it */
  16638. + wvegas_enable(sk);
  16639. +}
  16640. +/* @cwnd / @rtt_us as a 2^16 fixed-point value; @rtt_us is the divisor and must not be 0. */
  16641. +static inline u64 mptcp_wvegas_rate(u32 cwnd, u32 rtt_us)
  16642. +{
  16643. + return div_u64(mptcp_wvegas_scale(cwnd, MPTCP_WVEGAS_SCALE), rtt_us);
  16644. +}
  16645. +/* .pkts_acked hook: fold one RTT sample into base_rtt and the per-RTT accumulators. */
  16646. +static void mptcp_wvegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
  16647. +{
  16648. + struct wvegas *wvegas = inet_csk_ca(sk);
  16649. + u32 vrtt;
  16650. +
  16651. + if (rtt_us < 0) /* negative samples are ignored */
  16652. + return;
  16653. +
  16654. + vrtt = rtt_us + 1; /* +1 keeps the stored sample non-zero */
  16655. +
  16656. + if (vrtt < wvegas->base_rtt)
  16657. + wvegas->base_rtt = vrtt;
  16658. +
  16659. + wvegas->sampled_rtt += vrtt;
  16660. + wvegas->cnt_rtt++;
  16661. +}
  16662. +/* .set_state hook: wVegas is enabled only in TCP_CA_Open and disabled in every other state. */
  16663. +static void mptcp_wvegas_state(struct sock *sk, u8 ca_state)
  16664. +{
  16665. + if (ca_state == TCP_CA_Open)
  16666. + wvegas_enable(sk);
  16667. + else
  16668. + wvegas_disable(sk);
  16669. +}
  16670. +/* .cwnd_event hook: re-initialise on CWND_RESTART, clear the measured rate on LOSS. */
  16671. +static void mptcp_wvegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
  16672. +{
  16673. + if (event == CA_EVENT_CWND_RESTART) {
  16674. + mptcp_wvegas_init(sk);
  16675. + } else if (event == CA_EVENT_LOSS) {
  16676. + struct wvegas *wvegas = inet_csk_ca(sk);
  16677. + wvegas->instant_rate = 0; /* mptcp_wvegas_weight() then keeps the stored weight */
  16678. + }
  16679. +}
  16680. +/* Return the smaller of the current ssthresh and snd_cwnd - 1. */
  16681. +static inline u32 mptcp_wvegas_ssthresh(struct tcp_sock *tp)
  16682. +{
  16683. + return min(tp->snd_ssthresh, tp->snd_cwnd - 1);
  16684. +}
  16685. +/* Share of @sk's instant_rate in the summed rate of the sendable subflows (2^16 fixed point). */
  16686. +static u64 mptcp_wvegas_weight(struct mptcp_cb *mpcb, struct sock *sk)
  16687. +{
  16688. + u64 total_rate = 0;
  16689. + struct sock *sub_sk;
  16690. + struct wvegas *wvegas = inet_csk_ca(sk);
  16691. +
  16692. + if (!mpcb)
  16693. + return wvegas->weight;
  16694. +
  16695. + /* NOTE(review): assumes every subflow's inet_csk_ca area holds a struct wvegas - confirm */
  16696. + mptcp_for_each_sk(mpcb, sub_sk) {
  16697. + struct wvegas *sub_wvegas = inet_csk_ca(sub_sk);
  16698. +
  16699. + /* sampled_rtt is initialized by 0 */
  16700. + if (mptcp_sk_can_send(sub_sk) && (sub_wvegas->sampled_rtt > 0))
  16701. + total_rate += sub_wvegas->instant_rate;
  16702. + }
  16703. +
  16704. + if (total_rate && wvegas->instant_rate)
  16705. + return div64_u64(mptcp_wvegas_scale(wvegas->instant_rate, MPTCP_WVEGAS_SCALE), total_rate);
  16706. + else
  16707. + return wvegas->weight; /* nothing measured yet: keep the previous weight */
  16708. +}
  16709. +/* .cong_avoid hook: once @ack passes beg_snd_nxt, adjust cwnd from the RTT samples; else slow start. */
  16710. +static void mptcp_wvegas_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
  16711. +{
  16712. + struct tcp_sock *tp = tcp_sk(sk);
  16713. + struct wvegas *wvegas = inet_csk_ca(sk);
  16714. +
  16715. + if (!wvegas->doing_wvegas_now) {
  16716. + tcp_reno_cong_avoid(sk, ack, acked, in_flight);
  16717. + return;
  16718. + }
  16719. +
  16720. + if (after(ack, wvegas->beg_snd_nxt)) {
  16721. + wvegas->beg_snd_nxt = tp->snd_nxt;
  16722. +
  16723. + if (wvegas->cnt_rtt <= 2) { /* too few samples in this round: behave like Reno */
  16724. + tcp_reno_cong_avoid(sk, ack, acked, in_flight);
  16725. + } else {
  16726. + u32 rtt, diff, q_delay;
  16727. + u64 target_cwnd;
  16728. +
  16729. + rtt = wvegas->sampled_rtt / wvegas->cnt_rtt; /* mean RTT of this round */
  16730. + target_cwnd = div_u64(((u64)tp->snd_cwnd * wvegas->base_rtt), rtt);
  16731. +
  16732. + diff = div_u64((u64)tp->snd_cwnd * (rtt - wvegas->base_rtt), rtt);
  16733. +
  16734. + if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
  16735. + tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
  16736. + tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
  16737. +
  16738. + } else if (tp->snd_cwnd <= tp->snd_ssthresh) {
  16739. + tcp_slow_start(tp, acked);
  16740. + } else {
  16741. + if (diff >= wvegas->alpha) { /* refresh rate, weight and alpha before comparing */
  16742. + wvegas->instant_rate = mptcp_wvegas_rate(tp->snd_cwnd, rtt);
  16743. + wvegas->weight = mptcp_wvegas_weight(tp->mpcb, sk);
  16744. + wvegas->alpha = max(2U, (u32)((wvegas->weight * total_alpha) >> MPTCP_WVEGAS_SCALE));
  16745. + }
  16746. + if (diff > wvegas->alpha) {
  16747. + tp->snd_cwnd--;
  16748. + tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
  16749. + } else if (diff < wvegas->alpha) {
  16750. + tp->snd_cwnd++;
  16751. + }
  16752. +
  16753. + /* Try to drain link queue if needed */
  16754. + q_delay = rtt - wvegas->base_rtt;
  16755. + if ((wvegas->queue_delay == 0) || (wvegas->queue_delay > q_delay))
  16756. + wvegas->queue_delay = q_delay;
  16757. +
  16758. + if (q_delay >= 2 * wvegas->queue_delay) {
  16759. + u32 backoff_factor = div_u64(mptcp_wvegas_scale(wvegas->base_rtt, MPTCP_WVEGAS_SCALE), 2 * rtt);
  16760. + tp->snd_cwnd = ((u64)tp->snd_cwnd * backoff_factor) >> MPTCP_WVEGAS_SCALE;
  16761. + wvegas->queue_delay = 0;
  16762. + }
  16763. + }
  16764. +
  16765. + if (tp->snd_cwnd < 2)
  16766. + tp->snd_cwnd = 2;
  16767. + else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
  16768. + tp->snd_cwnd = tp->snd_cwnd_clamp;
  16769. +
  16770. + tp->snd_ssthresh = tcp_current_ssthresh(sk);
  16771. + }
  16772. +
  16773. + wvegas->cnt_rtt = 0; /* start a new sampling round */
  16774. + wvegas->sampled_rtt = 0;
  16775. + }
  16776. + /* Use normal slow start */
  16777. + else if (tp->snd_cwnd <= tp->snd_ssthresh)
  16778. + tcp_slow_start(tp, acked);
  16779. +}
  16780. +
  16781. +/* Congestion-control ops registered under the name "wvegas"; ssthresh and min_cwnd reuse Reno. */
  16782. +static struct tcp_congestion_ops mptcp_wvegas __read_mostly = {
  16783. + .flags = TCP_CONG_RTT_STAMP,
  16784. + .init = mptcp_wvegas_init,
  16785. + .ssthresh = tcp_reno_ssthresh,
  16786. + .cong_avoid = mptcp_wvegas_cong_avoid,
  16787. + .min_cwnd = tcp_reno_min_cwnd,
  16788. + .pkts_acked = mptcp_wvegas_pkts_acked,
  16789. + .set_state = mptcp_wvegas_state,
  16790. + .cwnd_event = mptcp_wvegas_cwnd_event,
  16791. +
  16792. + .owner = THIS_MODULE,
  16793. + .name = "wvegas",
  16794. +};
  16795. +
  16796. +static int __init mptcp_wvegas_register(void)
  16797. +{
  16798. + BUILD_BUG_ON(sizeof(struct wvegas) > ICSK_CA_PRIV_SIZE);
  16799. + tcp_register_congestion_control(&mptcp_wvegas);
  16800. + return 0;
  16801. +}
  16802. +/* Module exit: remove wVegas from the TCP congestion-control list. */
  16803. +static void __exit mptcp_wvegas_unregister(void)
  16804. +{
  16805. + tcp_unregister_congestion_control(&mptcp_wvegas);
  16806. +}
  16807. +
  16808. +module_init(mptcp_wvegas_register);
  16809. +module_exit(mptcp_wvegas_unregister);
  16810. +/* Module metadata */
  16811. +MODULE_AUTHOR("Yu Cao, Enhuan Dong");
  16812. +MODULE_LICENSE("GPL");
  16813. +MODULE_DESCRIPTION("MPTCP wVegas");
  16814. +MODULE_VERSION("0.1");